Repository: crs4/pydoop Branch: develop Commit: c346870c27b2 Files: 370 Total size: 1.6 MB Directory structure: gitextract_2qljhz4z/ ├── .dir-locals.el ├── .dockerignore ├── .gitignore ├── .travis/ │ ├── check_script_template.py │ ├── cmd/ │ │ └── hadoop_localfs.sh │ ├── run_checks │ └── start_container ├── .travis.yml ├── AUTHORS ├── Dockerfile ├── Dockerfile.client ├── Dockerfile.docs ├── LICENSE ├── MANIFEST.in ├── README.md ├── VERSION ├── dev_tools/ │ ├── build_deprecation_tables │ ├── bump_copyright_year │ ├── docker/ │ │ ├── client_side_tests/ │ │ │ ├── apache_2.6.0/ │ │ │ │ ├── initialize.sh │ │ │ │ └── local_client_setup.sh │ │ │ └── hdp_2.2.0.0/ │ │ │ ├── initialize.sh │ │ │ └── local_client_setup.sh │ │ ├── cluster.rst │ │ ├── clusters/ │ │ │ └── apache_2.6.0/ │ │ │ ├── docker-compose.yml │ │ │ └── images/ │ │ │ ├── base/ │ │ │ │ ├── Dockerfile │ │ │ │ └── scripts/ │ │ │ │ ├── generate_conf_files.py │ │ │ │ ├── zk_set.py │ │ │ │ └── zk_wait.py │ │ │ ├── bootstrap/ │ │ │ │ ├── Dockerfile │ │ │ │ └── scripts/ │ │ │ │ ├── bootstrap.py │ │ │ │ └── create_hdfs_dirs.sh │ │ │ ├── datanode/ │ │ │ │ ├── Dockerfile │ │ │ │ └── scripts/ │ │ │ │ └── start_datanode.sh │ │ │ ├── historyserver/ │ │ │ │ ├── Dockerfile │ │ │ │ └── scripts/ │ │ │ │ └── start_historyserver.sh │ │ │ ├── namenode/ │ │ │ │ ├── Dockerfile │ │ │ │ └── scripts/ │ │ │ │ └── start_namenode.sh │ │ │ ├── nodemanager/ │ │ │ │ ├── Dockerfile │ │ │ │ └── scripts/ │ │ │ │ └── start_nodemanager.sh │ │ │ ├── resourcemanager/ │ │ │ │ ├── Dockerfile │ │ │ │ └── scripts/ │ │ │ │ └── start_resourcemanager.sh │ │ │ └── zookeeper/ │ │ │ ├── Dockerfile │ │ │ └── scripts/ │ │ │ └── start_namenode.sh │ │ ├── images/ │ │ │ ├── base/ │ │ │ │ └── Dockerfile │ │ │ └── client/ │ │ │ └── Dockerfile │ │ └── scripts/ │ │ ├── build_base_images.sh │ │ ├── build_cluster_images.sh │ │ ├── share_etc_hosts.py │ │ ├── start_client.sh │ │ └── start_cluster.sh │ ├── docker_build │ ├── dump_app_params │ ├── edit_conf │ ├── git_export │ ├── import_src │ ├── mapred_pipes │ ├── unpack_debian │ └── update_docs ├── docs/ │ ├── Makefile │ ├── _build/ │ │ └── .gitignore │ ├── _templates/ │ │ └── layout.html │ ├── api_docs/ │ │ ├── hadut.rst │ │ ├── hdfs_api.rst │ │ ├── index.rst │ │ └── mr_api.rst │ ├── conf.py │ ├── examples/ │ │ ├── avro.rst │ │ ├── index.rst │ │ ├── input_format.rst │ │ ├── intro.rst │ │ └── sequence_file.rst │ ├── how_to_cite.rst │ ├── index.rst │ ├── installation.rst │ ├── news/ │ │ ├── archive.rst │ │ ├── index.rst │ │ └── latest.rst │ ├── pydoop_script.rst │ ├── pydoop_script_options.rst │ ├── pydoop_submit_options.rst │ ├── running_pydoop_applications.rst │ ├── self_contained.rst │ └── tutorial/ │ ├── hdfs_api.rst │ ├── index.rst │ ├── mapred_api.rst │ └── pydoop_script.rst ├── examples/ │ ├── README │ ├── avro/ │ │ ├── build.sh │ │ ├── config.sh │ │ ├── data/ │ │ │ └── mini_aligned_seqs.gz.parquet │ │ ├── pom.xml │ │ ├── py/ │ │ │ ├── avro_base.py │ │ │ ├── avro_container_dump_results.py │ │ │ ├── avro_key_in.py │ │ │ ├── avro_key_in_out.py │ │ │ ├── avro_key_value_in.py │ │ │ ├── avro_key_value_in_out.py │ │ │ ├── avro_parquet_dump_results.py │ │ │ ├── avro_pyrw.py │ │ │ ├── avro_value_in.py │ │ │ ├── avro_value_in_out.py │ │ │ ├── check_cc.py │ │ │ ├── check_results.py │ │ │ ├── color_count.py │ │ │ ├── create_input.py │ │ │ ├── gen_data.py │ │ │ ├── generate_avro_users.py │ │ │ ├── kmer_count.py │ │ │ ├── show_kmer_count.py │ │ │ └── write_avro.py │ │ ├── run │ │ ├── run_avro_container_in │ │ ├── run_avro_container_in_out │ │ ├── 
run_avro_parquet_in │ │ ├── run_avro_parquet_in_out │ │ ├── run_avro_pyrw │ │ ├── run_color_count │ │ ├── run_kmer_count │ │ ├── schemas/ │ │ │ ├── alignment_record.avsc │ │ │ ├── alignment_record_proj.avsc │ │ │ ├── pet.avsc │ │ │ ├── stats.avsc │ │ │ └── user.avsc │ │ ├── src/ │ │ │ └── main/ │ │ │ └── java/ │ │ │ └── it/ │ │ │ └── crs4/ │ │ │ └── pydoop/ │ │ │ ├── WriteKV.java │ │ │ └── WriteParquet.java │ │ └── write_avro_kv │ ├── c++/ │ │ ├── HadoopPipes.cc │ │ ├── Makefile │ │ ├── README.txt │ │ ├── SerialUtils.cc │ │ ├── StringUtils.cc │ │ ├── include/ │ │ │ └── hadoop/ │ │ │ ├── Pipes.hh │ │ │ ├── SerialUtils.hh │ │ │ ├── StringUtils.hh │ │ │ └── TemplateFactory.hh │ │ └── wordcount.cc │ ├── config.sh │ ├── hdfs/ │ │ ├── common.py │ │ ├── repl_session.py │ │ ├── run │ │ ├── treegen.py │ │ └── treewalk.py │ ├── input/ │ │ ├── alice_1.txt │ │ └── alice_2.txt │ ├── input_format/ │ │ ├── check_results.py │ │ ├── it/ │ │ │ └── crs4/ │ │ │ └── pydoop/ │ │ │ ├── mapred/ │ │ │ │ └── TextInputFormat.java │ │ │ └── mapreduce/ │ │ │ └── TextInputFormat.java │ │ └── run │ ├── pydoop_script/ │ │ ├── check.py │ │ ├── data/ │ │ │ ├── base_histogram_input/ │ │ │ │ ├── example_1.sam │ │ │ │ └── example_2.sam │ │ │ ├── stop_words.txt │ │ │ └── transpose_input/ │ │ │ └── matrix.txt │ │ ├── run │ │ ├── run_script.sh │ │ └── scripts/ │ │ ├── base_histogram.py │ │ ├── caseswitch.py │ │ ├── grep.py │ │ ├── lowercase.py │ │ ├── transpose.py │ │ ├── wc_combiner.py │ │ ├── wordcount.py │ │ └── wordcount_sw.py │ ├── pydoop_submit/ │ │ ├── check.py │ │ ├── data/ │ │ │ ├── cols_1.txt │ │ │ └── cols_2.txt │ │ ├── mr/ │ │ │ ├── map_only_java_writer.py │ │ │ ├── map_only_python_writer.py │ │ │ ├── nosep.py │ │ │ ├── wordcount_full.py │ │ │ └── wordcount_minimal.py │ │ ├── run │ │ └── run_submit.sh │ ├── run_all │ ├── self_contained/ │ │ ├── check_results.py │ │ ├── run │ │ └── vowelcount/ │ │ ├── __init__.py │ │ ├── lib/ │ │ │ └── __init__.py │ │ └── mr/ │ │ ├── __init__.py │ │ ├── main.py │ │ ├── mapper.py │ │ └── reducer.py │ └── sequence_file/ │ ├── bin/ │ │ ├── filter.py │ │ └── wordcount.py │ ├── check.py │ └── run ├── int_test/ │ ├── config.sh │ ├── mapred_submitter/ │ │ ├── check.py │ │ ├── genwords.py │ │ ├── input/ │ │ │ ├── map_only/ │ │ │ │ ├── f1.txt │ │ │ │ └── f2.txt │ │ │ ├── map_reduce/ │ │ │ │ ├── f1.txt │ │ │ │ └── f2.txt │ │ │ └── map_reduce_long/ │ │ │ └── f.txt │ │ ├── mr/ │ │ │ ├── map_only_java_writer.py │ │ │ ├── map_only_python_writer.py │ │ │ ├── map_reduce_combiner.py │ │ │ ├── map_reduce_java_rw.py │ │ │ ├── map_reduce_java_rw_pstats.py │ │ │ ├── map_reduce_python_partitioner.py │ │ │ ├── map_reduce_python_reader.py │ │ │ ├── map_reduce_python_writer.py │ │ │ ├── map_reduce_raw_io.py │ │ │ ├── map_reduce_slow_java_rw.py │ │ │ └── map_reduce_slow_python_rw.py │ │ ├── run │ │ ├── run_app.sh │ │ └── run_perf.sh │ ├── opaque_split/ │ │ ├── check.py │ │ ├── gen_splits.py │ │ ├── mrapp.py │ │ └── run │ ├── progress/ │ │ ├── mrapp.py │ │ └── run │ └── run_all ├── lib/ │ └── avro-mapred-1.7.7-hadoop2.jar ├── logo/ │ └── ubuntu-font-family.tar.bz2 ├── notice_template.txt ├── pydoop/ │ ├── __init__.py │ ├── app/ │ │ ├── __init__.py │ │ ├── argparse_types.py │ │ ├── main.py │ │ ├── script.py │ │ ├── script_template.py │ │ └── submit.py │ ├── avrolib.py │ ├── hadoop_utils.py │ ├── hadut.py │ ├── hdfs/ │ │ ├── __init__.py │ │ ├── common.py │ │ ├── core/ │ │ │ └── __init__.py │ │ ├── file.py │ │ ├── fs.py │ │ └── path.py │ ├── jc.py │ ├── mapreduce/ │ │ ├── __init__.py │ │ ├── api.py │ │ ├── 
binary_protocol.py │ │ ├── connections.py │ │ └── pipes.py │ ├── test_support.py │ ├── test_utils.py │ └── utils/ │ ├── __init__.py │ ├── conversion_tables.py │ ├── jvm.py │ ├── misc.py │ └── py3compat.py ├── pydoop.properties ├── requirements.txt ├── setup.cfg ├── setup.py ├── src/ │ ├── Py_macros.h │ ├── buf_macros.h │ ├── it/ │ │ └── crs4/ │ │ └── pydoop/ │ │ ├── NoSeparatorTextOutputFormat.java │ │ └── mapreduce/ │ │ └── pipes/ │ │ ├── Application.java │ │ ├── BinaryProtocol.java │ │ ├── DownwardProtocol.java │ │ ├── DummyRecordReader.java │ │ ├── OpaqueSplit.java │ │ ├── OutputHandler.java │ │ ├── PipesMapper.java │ │ ├── PipesNonJavaInputFormat.java │ │ ├── PipesNonJavaOutputFormat.java │ │ ├── PipesPartitioner.java │ │ ├── PipesReducer.java │ │ ├── PydoopAvroBridgeKeyReader.java │ │ ├── PydoopAvroBridgeKeyValueReader.java │ │ ├── PydoopAvroBridgeKeyValueWriter.java │ │ ├── PydoopAvroBridgeKeyWriter.java │ │ ├── PydoopAvroBridgeReaderBase.java │ │ ├── PydoopAvroBridgeValueReader.java │ │ ├── PydoopAvroBridgeValueWriter.java │ │ ├── PydoopAvroBridgeWriterBase.java │ │ ├── PydoopAvroInputBridgeBase.java │ │ ├── PydoopAvroInputKeyBridge.java │ │ ├── PydoopAvroInputKeyValueBridge.java │ │ ├── PydoopAvroInputValueBridge.java │ │ ├── PydoopAvroKeyInputFormat.java │ │ ├── PydoopAvroKeyOutputFormat.java │ │ ├── PydoopAvroKeyRecordReader.java │ │ ├── PydoopAvroKeyRecordWriter.java │ │ ├── PydoopAvroKeyValueInputFormat.java │ │ ├── PydoopAvroKeyValueOutputFormat.java │ │ ├── PydoopAvroKeyValueRecordReader.java │ │ ├── PydoopAvroKeyValueRecordWriter.java │ │ ├── PydoopAvroOutputBridgeBase.java │ │ ├── PydoopAvroOutputFormatBase.java │ │ ├── PydoopAvroOutputKeyBridge.java │ │ ├── PydoopAvroOutputKeyValueBridge.java │ │ ├── PydoopAvroOutputValueBridge.java │ │ ├── PydoopAvroRecordReaderBase.java │ │ ├── PydoopAvroRecordWriterBase.java │ │ ├── PydoopAvroValueInputFormat.java │ │ ├── PydoopAvroValueOutputFormat.java │ │ ├── PydoopAvroValueRecordReader.java │ │ ├── PydoopAvroValueRecordWriter.java │ │ ├── Submitter.java │ │ ├── TaskLog.java │ │ ├── TaskLogAppender.java │ │ └── UpwardProtocol.java │ ├── libhdfs/ │ │ ├── common/ │ │ │ ├── htable.c │ │ │ └── htable.h │ │ ├── config.h │ │ ├── exception.c │ │ ├── exception.h │ │ ├── hdfs.c │ │ ├── include/ │ │ │ └── hdfs/ │ │ │ └── hdfs.h │ │ ├── jni_helper.c │ │ ├── jni_helper.h │ │ └── os/ │ │ ├── mutexes.h │ │ ├── posix/ │ │ │ ├── mutexes.c │ │ │ ├── platform.h │ │ │ ├── thread.c │ │ │ └── thread_local_storage.c │ │ ├── thread.h │ │ ├── thread_local_storage.h │ │ └── windows/ │ │ ├── inttypes.h │ │ ├── mutexes.c │ │ ├── platform.h │ │ ├── thread.c │ │ ├── thread_local_storage.c │ │ └── unistd.h │ ├── native_core_hdfs/ │ │ ├── hdfs_file.cc │ │ ├── hdfs_file.h │ │ ├── hdfs_fs.cc │ │ ├── hdfs_fs.h │ │ └── hdfs_module.cc │ ├── py3k_compat.h │ └── sercore/ │ ├── HadoopUtils/ │ │ ├── SerialUtils.cc │ │ └── SerialUtils.hh │ ├── hu_extras.cpp │ ├── hu_extras.h │ ├── sercore.cpp │ ├── streams.cpp │ └── streams.h └── test/ ├── __init__.py ├── all_tests.py ├── app/ │ ├── __init__.py │ ├── all_tests.py │ └── test_submit.py ├── avro/ │ ├── all_tests.py │ ├── common.py │ ├── test_io.py │ └── user.avsc ├── common/ │ ├── __init__.py │ ├── all_tests.py │ ├── test_hadoop_utils.py │ ├── test_hadut.py │ ├── test_pydoop.py │ └── test_test_support.py ├── hdfs/ │ ├── __init__.py │ ├── all_tests.py │ ├── common_hdfs_tests.py │ ├── test_common.py │ ├── test_core.py │ ├── test_hdfs.py │ ├── test_hdfs_fs.py │ ├── test_local_fs.py │ ├── test_path.py │ └── try_hdfs.py ├── 
mapreduce/ │ ├── __init__.py │ ├── all_tests.py │ ├── it/ │ │ └── crs4/ │ │ └── pydoop/ │ │ └── mapreduce/ │ │ └── pipes/ │ │ └── OpaqueRoundtrip.java │ ├── m_task.cmd │ ├── r_task.cmd │ ├── test_connections.py │ └── test_opaque.py └── sercore/ ├── all_tests.py ├── test_deser.py └── test_streams.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .dir-locals.el ================================================ ;;; Directory Local Variables ;;; See Info node `(emacs) Directory Variables' for more information. ((python-mode (flycheck-flake8rc . "setup.cfg"))) ================================================ FILE: .dockerignore ================================================ .* Dockerfile* docker ================================================ FILE: .gitignore ================================================ *.pyc *~ build docs/_static/favicon.ico docs/_static/logo.png pydoop/config.py pydoop/version.py src/hadoop*/libhdfs/config.h src/hdfs/hdfs.xcodeproj src/hdfs/hdfs/* dist examples/**/*.class examples/**/*.jar test/timings/dataset pydoop.egg-info .DS_Store .idea *.xcodeproj ================================================ FILE: .travis/check_script_template.py ================================================ """\ Perform full substitution on the Pydoop script template and check it with flake8. Any options (i.e., arguments starting with at least a dash) are passed through to flake8. """ import sys import os import tempfile from flake8.main.cli import main as flake8_main THIS_DIR = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.join(THIS_DIR, os.pardir, "pydoop", "app")) from script_template import DRIVER_TEMPLATE def main(argv): code = DRIVER_TEMPLATE.substitute( module="module", map_fn="map_fn", reduce_fn="reduce_fn", combine_fn="combine_fn", combiner_wp="None", ) fd = None try: fd, fn = tempfile.mkstemp(suffix=".py", text=True) os.write(fd, code.encode("utf-8")) finally: if fd is not None: os.close(fd) flake8_argv = [fn] + [_ for _ in argv if _.startswith("-")] try: flake8_main(flake8_argv) finally: os.remove(fn) if __name__ == "__main__": argv = sys.argv[1:] if set(argv).intersection(["-h", "--help"]): print(__doc__) else: main(argv) ================================================ FILE: .travis/cmd/hadoop_localfs.sh ================================================ #!/bin/bash set -euo pipefail [ -n "${DEBUG:-}" ] && set -x function onshutdown { mr-jobhistory-daemon.sh stop historyserver yarn-daemon.sh stop nodemanager yarn-daemon.sh stop resourcemanager } trap onshutdown SIGTERM trap onshutdown SIGINT conf_dir=$(dirname $(dirname $(command -v hadoop)))/etc/hadoop cat >"${conf_dir}"/core-site.xml <<EOF <?xml version="1.0"?> <configuration></configuration> EOF cat >"${conf_dir}"/hdfs-site.xml <<EOF <?xml version="1.0"?> <configuration></configuration> EOF yarn-daemon.sh start resourcemanager yarn-daemon.sh start nodemanager mr-jobhistory-daemon.sh start historyserver tail -f /dev/null onshutdown ================================================ FILE: .travis/run_checks ================================================ #!/bin/bash set -euo pipefail [ -n "${DEBUG:-}" ] && set -x docker exec pydoop bash -c 'cd test && ${PYTHON} all_tests.py' docker exec pydoop bash -c 'cd test/avro && ${PYTHON} all_tests.py' docker exec -e DEBUG="${DEBUG:-}" pydoop bash -c 'cd int_test && ./run_all' docker exec -e DEBUG="${DEBUG:-}" pydoop bash -c 'cd examples && ./run_all' docker exec -e DEBUG="${DEBUG:-}" pydoop bash -c 'cd examples/avro && ./run'
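A note on the two heredocs in hadoop_localfs.sh above (reconstructed here as empty configurations): with no properties set, Hadoop falls back to its built-in defaults, and the default for fs.defaultFS is file:///, i.e., the local filesystem -- which is the point of the LOCAL_FS build configuration in .travis.yml. A minimal Python sketch of the same idea, not part of the repository; the conf_dir value is illustrative, while the real script derives it from the location of the hadoop executable:

import os
import xml.etree.ElementTree as ET

def write_empty_site(conf_dir, name):
    # An empty <configuration> element carries no overrides, so Hadoop
    # uses its defaults (fs.defaultFS=file:///, the local filesystem).
    path = os.path.join(conf_dir, name)
    ET.ElementTree(ET.Element("configuration")).write(
        path, encoding="utf-8", xml_declaration=True)

conf_dir = "/tmp/hadoop-conf"  # illustrative path
os.makedirs(conf_dir, exist_ok=True)
for name in ("core-site.xml", "hdfs-site.xml"):
    write_empty_site(conf_dir, name)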
================================================ FILE: .travis/start_container ================================================ #!/bin/bash set -euo pipefail [ -n "${DEBUG:-}" ] && set -x this="${BASH_SOURCE-$0}" this_dir=$(cd -P -- "$(dirname -- "${this}")" && pwd -P) img=crs4/pydoop:${HADOOP_VERSION}-${TRAVIS_PYTHON_VERSION} pushd "${this_dir}" cmd_dir=$(readlink -e "cmd") pushd .. docker build . \ --build-arg hadoop_version=${HADOOP_VERSION} \ --build-arg python_version=${TRAVIS_PYTHON_VERSION} \ -t ${img} if [ -n "${LOCAL_FS:-}" ]; then docker run --rm --name pydoop -v "${cmd_dir}":/cmd:ro -d ${img} \ /cmd/hadoop_localfs.sh else docker run --rm --name pydoop -d ${img} docker exec pydoop bash -c 'until datanode_cid; do sleep 0.1; done' fi popd popd ================================================ FILE: .travis.yml ================================================ language: python cache: pip matrix: include: - python: "2.7" env: HADOOP_VERSION=3.2.0 - python: "3.6" env: HADOOP_VERSION=2.9.2 - python: "3.6" env: HADOOP_VERSION=3.2.0 - python: "3.6" env: HADOOP_VERSION=3.2.0 LOCAL_FS=true - python: "3.7" env: HADOOP_VERSION=3.2.0 dist: xenial sudo: required services: docker before_install: pip install flake8 # skip installation, requirements are handled in the Docker image install: true before_script: - flake8 -v . - python .travis/check_script_template.py -v - docker build -t crs4/pydoop-docs -f Dockerfile.docs . script: - ./.travis/start_container - ./.travis/run_checks - docker stop pydoop deploy: provider: pypi user: "${CI_USER}" password: "${CI_PASS}" on: python: "3.7" repo: crs4/pydoop tags: true ================================================ FILE: AUTHORS ================================================ Pydoop is developed and maintained by: * Simone Leo * Gianluigi Zanetti * Luca Pireddu * Francesco Cabras * Mauro Del Rio * Marco Enrico Piras Other contributors: * Cosmin Cătănoaie * Liam Slusser * Jeremy G. Kahn * Simon Li ================================================ FILE: Dockerfile ================================================ ARG hadoop_version=3.2.0 ARG python_version=3.6 FROM crs4/pydoop-base:${hadoop_version}-${python_version} COPY . /build/pydoop WORKDIR /build/pydoop RUN ${PYTHON} -m pip install --no-cache-dir --upgrade -r requirements.txt \ && ${PYTHON} setup.py sdist \ && ${PYTHON} -m pip install --pre dist/pydoop-$(cat VERSION).tar.gz ================================================ FILE: Dockerfile.client ================================================ ARG hadoop_version=3.2.0 ARG python_version=3.6 FROM crs4/pydoop-client-base:${hadoop_version}-${python_version} COPY . /build/pydoop WORKDIR /build/pydoop RUN ${PYTHON} -m pip install --no-cache-dir --upgrade -r requirements.txt \ && ${PYTHON} setup.py build \ && ${PYTHON} setup.py install --skip-build \ && ${PYTHON} setup.py clean ================================================ FILE: Dockerfile.docs ================================================ FROM crs4/pydoop-docs-base COPY . 
/build/pydoop WORKDIR /build/pydoop RUN ${PYTHON} -m pip install --no-cache-dir --upgrade -r requirements.txt \ && ${PYTHON} setup.py build \ && ${PYTHON} setup.py install --skip-build \ && ${PYTHON} setup.py clean \ && inkscape -z -D -f logo/logo.svg -e logo.png -w 800 2>/dev/null \ && convert -resize 200x logo.png docs/_static/logo.png \ && inkscape -z -D -f logo/favicon.svg -e 256.png -w 256 -h 256 2>/dev/null \ && for i in 16 32 64 128; do \ convert 256.png -resize ${i}x${i} ${i}.png; \ done \ && convert 16.png 32.png 64.png 128.png docs/_static/favicon.ico \ && for a in script submit; do \ ${PYTHON} dev_tools/dump_app_params --app ${a} -o docs/pydoop_${a}_options.rst; \ done \ && make SPHINXOPTS="-W" -C docs html ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) 
The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: MANIFEST.in ================================================ include AUTHORS LICENSE VERSION README.md pydoop.properties requirements.txt recursive-include src * recursive-include test * recursive-include examples * recursive-include docs * recursive-include lib * ================================================ FILE: README.md ================================================ [![Build Status](https://travis-ci.org/crs4/pydoop.png)](https://travis-ci.org/crs4/pydoop) Pydoop is a Python MapReduce and HDFS API for [Hadoop](http://hadoop.apache.org/). Copyright 2009-2026 [CRS4](http://www.crs4.it/). To get started, take a look at [the docs](http://crs4.github.io/pydoop/). ================================================ FILE: VERSION ================================================ 2.0.0 ================================================ FILE: dev_tools/build_deprecation_tables ================================================ #!/usr/bin/env python """ A utility to generate mrv1 to mrv2 conversion tables. Usage:: bash$ build_deprecation_tables /opt/hadoop-2.4.1-src ./pydoop/utils/conversion_tables.py """ import os, sys, re DEFAULT_DEPRECATED_PROPERTIES_APT_VM_FNAME = \ "hadoop-common-project/hadoop-common/src/site/apt/DeprecatedProperties.apt.vm" block_separator = '||' def extract_tables(apt_vm_fname): """Returns the deprecated-to-new-property table and its inverse as two dict(s).""" with open(apt_vm_fname) as f: lines = [x for x in f.readlines() if re.match(r'^\|[^\|]', x)] pairs = [p for p in [map(lambda x : x.strip(), l.split('|'))[1:] for l in lines] if not p[1].startswith('NONE')] return dict(pairs), dict(( (y, x) for (x, y) in pairs)) def main(argv): src_root = argv[0] module_path = argv[1] fname = os.path.join(src_root, DEFAULT_DEPRECATED_PROPERTIES_APT_VM_FNAME) mrv1_to_mrv2, mrv2_to_mrv1 = extract_tables(fname) with open(module_path, 'w') as f: f.write('mrv1_to_mrv2=%r\n' % mrv1_to_mrv2); f.write('mrv2_to_mrv1=%r\n' % mrv2_to_mrv1); if __name__ == "__main__": main(sys.argv[1:]) ================================================ FILE: dev_tools/bump_copyright_year ================================================ #!/usr/bin/env python """\ Set copyright end year across the distribution.
""" import sys import os import re import argparse import datetime THIS_YEAR = datetime.date.today().year THIS_DIR = os.path.dirname(os.path.abspath(__file__)) PATTERN = re.compile(r"(?<=opyright 2009-)\d+") def find_files(root_dir): for d, subdirs, fnames in os.walk(root_dir, topdown=True): for fn in fnames: yield os.path.join(d, fn) subdirs[:] = [_ for _ in subdirs if _ != ".git"] def bump_end_year(root_dir, year): year = "%d" % year for fn in find_files(root_dir): if fn == os.path.abspath(__file__): continue print("processing %r" % (fn,)) with open(fn, "r") as f: try: content = f.read() except UnicodeDecodeError: continue with open(fn, "w") as f: f.write(re.sub(PATTERN, year, content)) def make_parser(): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("-y", type=int, metavar="YYYY", default=THIS_YEAR, help="copyright end year (default = current)") return parser def main(argv): parser = make_parser() args = parser.parse_args(argv[1:]) repo_root = os.path.dirname(THIS_DIR) bump_end_year(repo_root, args.y) if __name__ == "__main__": main(sys.argv) ================================================ FILE: dev_tools/docker/client_side_tests/apache_2.6.0/initialize.sh ================================================ #!/bin/bash port=$1 client_id=$2 rm_container_id=$3 DOCKER_HOST_IP=${4:-localhost} #---------------------------------- client_name=`docker exec ${client_id} hostname` #----- Upload hadoop to the client container hdp_ver=hadoop-2.6.0 hdp_tgz=${hdp_ver}.tar.gz if [[ ! -f ${hdp_tgz} ]] then hdp_url=http://mirror.nohup.it/apache/hadoop/common/${hdp_ver}/${hdp_tgz} wget ${hdp_url} -O ${hdp_tgz} fi # copy the hadoop*.tar.gz scp -P${port} ${hdp_tgz} root@${DOCKER_HOST_IP}:/opt/ # copy the installer script scp -P${port} local_client_setup.sh root@${DOCKER_HOST_IP}:. # exec and remove the installer script ssh -p${port} root@${DOCKER_HOST_IP} './local_client_setup.sh && rm local_client_setup.sh' # copy the hadoop configuration from the resourcemanager container to the client container echo "Copying hadoop config from the resourcemanager container..." for c in core-site.xml mapred-site.xml yarn-site.xml do from=/opt/hadoop/etc/hadoop/${c} to=/opt/hadoop/etc/hadoop/${c} docker exec -it ${rm_container_id} scp ${from} ${client_name}:${to} done ================================================ FILE: dev_tools/docker/client_side_tests/apache_2.6.0/local_client_setup.sh ================================================ #!/bin/bash #----------- # This script should be run in the client container. 
pushd /opt #----- Hadoop setup hdp_ver=hadoop-2.6.0 hdp_tgz=${hdp_ver}.tar.gz tar xzf ${hdp_tgz} ln -s ./${hdp_ver} hadoop cat <<EOF > /opt/hadoop/etc/hadoop/core-site.xml <?xml version="1.0"?> <configuration> <property> <name>fs.defaultFS</name> <value>hdfs://namenode:9000</value> </property> </configuration> EOF cat <<EOF > /opt/hadoop/etc/hadoop/yarn-site.xml <?xml version="1.0"?> <configuration> <property> <name>yarn.resourcemanager.hostname</name> <value>resourcemanager</value> </property> </configuration> EOF export HADOOP_HOME=/opt/hadoop export PATH=${HADOOP_HOME}/bin:${PATH} popd #------------------ # Pydoop setup git_url=https://github.com/crs4/pydoop.git cat <<EOF > /home/aen/prepare_pydoop.sh export HADOOP_HOME=/opt/hadoop git clone ${git_url} cd pydoop python setup.py build EOF cat <<EOF > /home/aen/run_tests.sh export HADOOP_HOME=/opt/hadoop export PATH=\${HADOOP_HOME}/bin:\${PATH} cd pydoop/test python all_tests.py EOF cat <<EOF > /home/aen/run_examples.sh export HADOOP_HOME=/opt/hadoop export PATH=\${HADOOP_HOME}/bin:\${PATH} cd pydoop/examples ./run_all EOF cat <<EOF > /home/aen/run_test_jar.sh export HADOOP_HOME=/opt/hadoop export PATH=\${HADOOP_HOME}/bin:\${PATH} hdfs dfs -put run_test_jar.sh yarn jar /opt/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar wordcount run_test_jar.sh foobar EOF #---------------------------------------------------- # Fix bad sw versions and missing things apt-get install -y zip pip install setuptools --upgrade #su - aen -c '/bin/bash ./prepare_pydoop.sh' #su - aen -c '/bin/bash ./run_test_jar.sh' #su - aen -c '/bin/bash ./run_tests.sh' #su - aen -c '/bin/bash ./run_examples.sh' ================================================ FILE: dev_tools/docker/client_side_tests/hdp_2.2.0.0/initialize.sh ================================================ #!/bin/bash port=$1 client_id=$2 rm_container_id=$3 DOCKER_HOST_IP=${4:-localhost} #---------------------------------- client_name=`docker exec ${client_id} hostname` #---------------------------------- scp -P${port} local_client_setup.sh root@${DOCKER_HOST_IP}:. # exec and remove the installer script ssh -p${port} root@${DOCKER_HOST_IP} './local_client_setup.sh && rm local_client_setup.sh' # copy the hadoop configuration from the resourcemanager container to the client container echo "Copying hadoop config from the resourcemanager container..."
for c in core-site.xml mapred-site.xml yarn-site.xml do from=/opt/hadoop/etc/hadoop/${c} to=/etc/hadoop/conf/${c} docker exec -it ${rm_container_id} scp ${from} ${client_name}:${to} done ================================================ FILE: dev_tools/docker/client_side_tests/hdp_2.2.0.0/local_client_setup.sh ================================================ #!/bin/bash # This script should be run in the client container, see initialize.sh #----------- function log() { echo "$1" } function install_hdp2_ubuntu_packages() { local VERSION="${1}" local HRTWRKS_REPO=http://public-repo-1.hortonworks.com/HDP/ubuntu12/2.x local HDP_LIST=${HRTWRKS_REPO}/GA/${VERSION}/hdp.list log "Adding repository" wget -nv ${HDP_LIST} -O /etc/apt/sources.list.d/hdp.list gpg --keyserver pgp.mit.edu --recv-keys B9733A7A07513CAD && gpg -a --export 07513CAD | apt-key add - apt-get update apt-get install -y hadoop hadoop-hdfs libhdfs0 \ hadoop-yarn hadoop-mapreduce hadoop-client \ openssl libsnappy1 libsnappy-dev } #----- Hadoop setup hdp_ver=2.2.0.0 install_hdp2_ubuntu_packages ${hdp_ver} export HADOOP_HOME=/usr/hdp/current/hadoop-client export PATH=${HADOOP_HOME}/bin:${PATH} #------------------ # Pydoop setup git_url=https://github.com/crs4/pydoop.git cat <<EOF > /home/aen/prepare_pydoop.sh git clone ${git_url} cd pydoop python setup.py build EOF cat <<EOF > /home/aen/run_tests.sh cd pydoop/test python all_tests.py EOF cat <<EOF > /home/aen/run_examples.sh cd pydoop/examples ./run_all EOF cat <<EOF > /home/aen/run_test_jar.sh hdfs dfs -put run_test_jar.sh yarn jar /usr/hdp/2.2.0.0-2041/hadoop-mapreduce/hadoop-mapreduce-examples.jar wordcount run_test_jar.sh foobar EOF #---------------------------------------------------- # Fix bad sw versions and missing things apt-get install -y zip pip install setuptools --upgrade #su - aen -c '/bin/bash ./prepare_pydoop.sh' #cd /home/aen/pydoop #python setup.py install #cd #su - aen -c '/bin/bash ./run_test_jar.sh' #su - aen -c '/bin/bash ./run_tests.sh' #su - aen -c '/bin/bash ./run_examples.sh' ================================================ FILE: dev_tools/docker/cluster.rst ================================================ Testing pydoop using a Docker Cluster ===================================== The purpose of the pydoop docker cluster is to provide a full, standard hadoop cluster that can be used for testing purposes. This is a "real" cluster, not a single-node pseudo-cluster. The supported testing strategy is to do the following: #. choose and start an appropriate docker cluster; #. log in to the 'client' node provided by the cluster; #. install on the client node the targeted hadoop version -- it should be compatible with the cluster; compatibility at the protocol level should be enough; #. install on the client node the pydoop version under test; #. run pydoop tests and examples. Docker cluster -------------- Build a cluster ;;;;;;;;;;;;;;; Cluster configurations are defined in subdirectories of the directory ``clusters``, e.g., ``clusters/apache_2.6.0``. Do the following to build all the cluster-independent images:: $ cd clusters $ ../scripts/build_base_images.sh Next, build all the cluster-dependent images:: $ ../scripts/build_cluster_images.sh apache_2.6.0 where we have used ``apache_2.6.0`` as an example. Run a cluster ;;;;;;;;;;;;; To start a cluster, do the following:: $ ../scripts/start_cluster.sh apache_2.6.0 No stopped containers Creating apache260_zookeeper_1... Creating apache260_bootstrap_1... Creating apache260_client_1... Creating apache260_namenode_1... Creating apache260_datanode_1...
Creating apache260_historyserver_1... Creating apache260_resourcemanager_1... Creating apache260_nodemanager_1... The script attempts to clean up leftovers from previous runs. Thus, if this is not the first time you have run it, it will ask for your permission to remove the old containers:: $ ../scripts/start_cluster.sh apache_2.6.0 Stopping apache260_nodemanager_1... Stopping apache260_resourcemanager_1... Stopping apache260_historyserver_1... Stopping apache260_datanode_1... Stopping apache260_namenode_1... Stopping apache260_client_1... Stopping apache260_zookeeper_1... Going to remove apache260_nodemanager_1, apache260_resourcemanager_1, apache260_historyserver_1, apache260_client_1, apache260_datanode_1, apache260_namenode_1, apache260_bootstrap_1, apache260_zookeeper_1 Are you sure? [yN] y Removing apache260_zookeeper_1... Removing apache260_bootstrap_1... Removing apache260_client_1... Removing apache260_namenode_1... Removing apache260_datanode_1... Removing apache260_historyserver_1... Removing apache260_resourcemanager_1... Removing apache260_nodemanager_1... Moved logs to logs.backup.12522 Moved local to local.backup.12522 Creating apache260_zookeeper_1... Creating apache260_bootstrap_1... Creating apache260_client_1... Creating apache260_namenode_1... Creating apache260_datanode_1... Creating apache260_historyserver_1... Creating apache260_resourcemanager_1... Creating apache260_nodemanager_1... To check how the cluster is doing, look at the logs of the bootstrap node:: $ cd apache_2.6.0 $ docker-compose logs bootstrap Attaching to apache260_bootstrap_1 bootstrap_1 | INFO:root:Starting bootstrap. bootstrap_1 | INFO:root:Waiting for /etc/hosts to update on bootstrap bootstrap_1 | INFO:root:Waiting for /etc/hosts to update on bootstrap bootstrap_1 | .... bootstrap_1 | INFO:root:Waiting for /etc/hosts to update on bootstrap bootstrap_1 | INFO:kazoo.client:Connecting to zookeeper:2181 bootstrap_1 | INFO:kazoo.client:Zookeeper connection established, state: CONNECTED bootstrap_1 | INFO:root:Booting namenode bootstrap_1 | INFO:root: done. bootstrap_1 | INFO:root:Booting datanode bootstrap_1 | INFO:root: done. bootstrap_1 | Creating /mr-history/tmp bootstrap_1 | Creating /mr-history/done bootstrap_1 | Setting ownership (mapred:hadoop) and permissions for /mr-history bootstrap_1 | INFO:root:Booting resourcemanager bootstrap_1 | INFO:root: done. bootstrap_1 | INFO:root:Booting nodemanager bootstrap_1 | INFO:root: done. bootstrap_1 | INFO:root:Booting historyserver bootstrap_1 | INFO:root: done. bootstrap_1 | INFO:root:Done with bootstrap. apache260_bootstrap_1 exited with code 0 Then check: #. the namenode, ``http://localhost:50070``: it should be up and reporting a datanode; #. the resourcemanager, ``http://localhost:8088``: it should be up and reporting a nodemanager; #. the historyserver, ``http://localhost:19888``. How to use a docker cluster --------------------------- These are the basic steps. Change directory to ``client_side_tests``, choose a specific distribution, say ``apache_2.6.0``, and ``cd`` to that directory. Run the following command:: $ ../../scripts/start_client.sh [<PORT>] The script will create a new docker container with a cluster client node that will respond to ssh connections on port ``PORT``, with 3333 as its default value. The ``start_client.sh`` script will execute the bash script ``initialize.sh`` (see the provided client side tests for examples) to install on the client container the appropriate hadoop distribution, the needed software and a set of utility scripts. .. note:: You will probably have to answer 'yes' twice to ssh paranoia. Log in to the client, install pydoop and run the tests:: $ ssh -p 3333 root@localhost Linux minas-morgul 3.18.7-gentoo #1 SMP Mon Feb 23 17:39:58 PST 2015 x86_64 The programs included with the Debian GNU/Linux system are free software; the exact distribution terms for each program are described in the individual files in /usr/share/doc/*/copyright. Debian GNU/Linux comes with ABSOLUTELY NO WARRANTY, to the extent permitted by applicable law. root@client:~# su - aen -c "bash -x prepare_pydoop.sh" root@client:~# cd /home/aen/pydoop/ root@client:~# python setup.py install root@client:~# cd root@client:~# su - aen -c "bash -x run_tests.sh" root@client:~# su - aen -c "bash -x run_examples.sh" Details ------- Bootstrap strategy ;;;;;;;;;;;;;;;;;; The main synchronization issues are: #. all hosts should be able to resolve logical names to IPs; e.g., the namenode needs to resolve the datanodes' IPs to their logical names; #. part of the inter-service communication is handled through shared hdfs directories, which must be accessible with the appropriate permissions before the dependent services fire up. The bootstrap strategy is as follows. #. There is an external mechanism -- here, the script ``../scripts/share_etc_hosts.py``, though it should really be integrated in docker-compose -- that guarantees that all nodes have in their ``/etc/hosts`` entries for all nodes in the group. We need an external mechanism that can talk to the docker server to be sure that all the involved nodes are covered. #. We have a zookeeper node that is guaranteed to be fired up before any other service, since all other nodes are linked to it in the docker-compose.yml file. #. We have an auxiliary service, bootstrap, that is in charge of orchestrating the system bootstrap. #. The expected bootstrap workflow is as follows. a. docker-compose starts b. all services (except zookeeper and bootstrap) wait until their node under ``zookeeper:/`` is set to ``boot`` c. bootstrap then does the following: 1. waits until its /etc/hosts has been changed; 2. sets ``/{namenode,datanode}`` to ``boot``; 3. waits until the namenode sets ``/namenode`` to ``up``; 4. creates the needed hdfs dirs with appropriate permissions; 5. sets ``/{resourcemanager,nodemanager,historyserver}`` to ``boot``; 6. dies gracefully.
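The handshake sketched above is implemented by ``zk_wait.py``, ``zk_set.py`` and ``bootstrap.py``, listed further down among the image files. As a rough illustration -- a minimal sketch, not part of the repository, assuming a reachable ``zookeeper:2181`` -- the service side of the handshake amounts to a few kazoo calls::

    # Boot handshake for a single service (the namenode is used here as an
    # example): wait until bootstrap creates our znode with value 'boot',
    # start the daemon, then flip the znode to 'up' to unblock bootstrap.
    import time
    from kazoo.client import KazooClient

    kz = KazooClient(hosts='zookeeper:2181')
    kz.start()
    while not kz.exists('/namenode'):  # created by bootstrap.py
        time.sleep(1)
    # ... start the Hadoop daemon here (see start_namenode.sh) ...
    kz.set('/namenode', 'up')  # boot_node() in bootstrap.py polls for this
    kz.stop()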
================================================ FILE: dev_tools/docker/clusters/apache_2.6.0/docker-compose.yml ================================================ zookeeper: image: crs4_pydoop/apache_2.6.0_zookeeper:latest name: zookeeper hostname: zookeeper ports: - "2181:2181" bootstrap: image: crs4_pydoop/apache_2.6.0_bootstrap:latest name: bootstrap hostname: bootstrap links: - zookeeper namenode: image: crs4_pydoop/apache_2.6.0_namenode:latest name: namenode hostname: namenode volumes: - ./logs:/tmp/logs links: - zookeeper ports: - "9000:9000" - "50070:50070" datanode: image: crs4_pydoop/apache_2.6.0_datanode:latest name: datanode hostname: datanode volumes_from: - namenode links: - zookeeper ports: - "50020:50020" resourcemanager: image: crs4_pydoop/apache_2.6.0_resourcemanager:latest name: resourcemanager hostname: resourcemanager volumes_from: - namenode links: - zookeeper ports: - "8088:8088" - "8021:8021" - "8031:8031" - "8033:8033" historyserver: image: crs4_pydoop/apache_2.6.0_historyserver:latest name: historyserver hostname: historyserver volumes_from: - namenode links: - zookeeper ports: - "10020:10020" - "19888:19888" nodemanager: image: crs4_pydoop/apache_2.6.0_nodemanager:latest name: nodemanager hostname: nodemanager links: - zookeeper ports: - "8042:8042" volumes_from: - namenode - client client: image: crs4_pydoop/client:latest name: client hostname: client ports: - "2222:22" volumes: - ./local:/usr/local ================================================ FILE: dev_tools/docker/clusters/apache_2.6.0/images/base/Dockerfile ================================================ #---------------------------------------------------- FROM crs4_pydoop/base:latest # ------------------------------------------------------------------ # Get zookeeper ENV zoo_ver zookeeper-3.4.6 ENV zoo_tgz ${zoo_ver}.tar.gz ENV zoo_site http://mirror.nohup.it/apache/zookeeper ENV zoo_tgz_site ${zoo_site}/${zoo_ver} RUN wget ${zoo_tgz_site}/${zoo_tgz} -O ${zoo_tgz} && \ mkdir -p /opt && tar -C /opt -xzf ${zoo_tgz} && rm -f ${zoo_tgz} && \ ln -s /opt/${zoo_ver} /opt/zookeeper ENV ZOO_DATA_DIR /data/zookeeper/data ENV ZOO_CLIENT_PORT 2181 EXPOSE ${ZOO_CLIENT_PORT} RUN mkdir -p ${ZOO_DATA_DIR} RUN echo "tickTime=2000" > /opt/zookeeper/conf/zoo.cfg && \ echo "dataDir=${ZOO_DATA_DIR}" >> /opt/zookeeper/conf/zoo.cfg && \ echo "clientPort=${ZOO_CLIENT_PORT}" >> /opt/zookeeper/conf/zoo.cfg && \ echo 1 > ${ZOO_DATA_DIR}/myid # Note that we are forcing the installation into dist-packages, # so that it will be possible to share kazoo and externally mount /usr/local later.
RUN pip install kazoo -t /usr/lib/python2.7/dist-packages COPY scripts/zk_wait.py /tmp/ COPY scripts/zk_set.py /tmp/ # ----------------------------------------------------------------- # Get hadoop ENV hdp_ver hadoop-2.6.0 ENV hdp_tgz ${hdp_ver}.tar.gz ENV hdp_site http://mirror.nohup.it/apache/hadoop/common ENV hdp_tgz_site ${hdp_site}/hadoop-2.6.0 RUN wget ${hdp_tgz_site}/${hdp_tgz} -O ${hdp_tgz} && \ mkdir -p /opt && tar -C /opt -xzf ${hdp_tgz} && rm -f ${hdp_tgz} && \ ln -s /opt/${hdp_ver} /opt/hadoop # ------------------------------------------------------------------ # User:Group Daemons # hdfs:hadoop NameNode, Secondary NameNode, JournalNode, DataNode # yarn:hadoop ResourceManager, NodeManager # mapred:hadoop MapReduce JobHistory Server ENV HADOOP_GROUP hadoop ENV HDFS_USER hdfs ENV YARN_USER yarn ENV MAPRED_USER mapred ENV HDP_DATA_ROOT /data/hadoop ENV LOG_DIR_ROOT /tmp/logs ENV HADOOP_TMP_DIR /tmp ENV HADOOP_CONF_DIR /opt/hadoop/etc/hadoop ENV DFS_NAME_DIR ${HDP_DATA_ROOT}/hdfs/nn ENV DFS_DATA_DIR ${HDP_DATA_ROOT}/hdfs/dn ENV DFS_CHECKPOINT_DIR ${HDP_DATA_ROOT}/hdfs/snn ENV HDFS_LOG_DIR ${LOG_DIR_ROOT}/hdfs ENV HDFS_PID_DIR ${HDP_DATA_ROOT}/pid/hdfs ENV YARN_LOCAL_DIR ${HDP_DATA_ROOT}/yarn ENV YARN_LOG_DIR ${LOG_DIR_ROOT}/yarn ENV YARN_LOCAL_LOG_DIR ${YARN_LOCAL_DIR}/userlogs ENV YARN_PID_DIR ${HDP_DATA_ROOT}/pid/yarn ENV YARN_REMOTE_APP_LOG_DIR /app-logs ENV MAPRED_LOG_DIR ${LOG_DIR_ROOT}/mapred ENV MAPRED_PID_DIR ${HDP_DATA_ROOT}/pid/mapred ENV MAPRED_JH_ROOT_DIR /mr-history ENV MAPRED_JH_INTERMEDIATE_DONE_DIR ${MAPRED_JH_ROOT_DIR}/tmp ENV MAPRED_JH_DONE_DIR ${MAPRED_JH_ROOT_DIR}/done #---------------------------------------------------------- # Create groups and users RUN groupadd ${HADOOP_GROUP} && \ useradd -g ${HADOOP_GROUP} ${HDFS_USER} && \ useradd -g ${HADOOP_GROUP} ${YARN_USER} && \ useradd -g ${HADOOP_GROUP} ${MAPRED_USER} # Create DATA_DIR_ROOT RUN mkdir -p ${HDP_DATA_ROOT} && \ chmod -R 755 ${HDP_DATA_ROOT} # Create LOG_DIR_ROOT RUN mkdir -p ${LOG_DIR_ROOT} && \ chmod -R 1777 ${LOG_DIR_ROOT} RUN mkdir -p ${HADOOP_CONF_DIR} ### HDFS DIRs ########################################################### # DataNode RUN mkdir -p ${DFS_DATA_DIR} && \ chown -R ${HDFS_USER}:${HADOOP_GROUP} ${DFS_DATA_DIR} && \ chmod -R 750 ${DFS_DATA_DIR} # NameNode RUN mkdir -p ${DFS_NAME_DIR} && \ chown -R ${HDFS_USER}:${HADOOP_GROUP} ${DFS_NAME_DIR} && \ chmod -R 755 ${DFS_NAME_DIR} # HDFS log dir RUN mkdir -p ${HDFS_LOG_DIR} && \ chown -R ${HDFS_USER}:${HADOOP_GROUP} ${HDFS_LOG_DIR} && \ chmod -R 750 ${HDFS_LOG_DIR} # HDFS pid dir RUN mkdir -p ${HDFS_PID_DIR} && \ chown -R ${HDFS_USER}:${HADOOP_GROUP} ${HDFS_PID_DIR} && \ chmod -R 750 ${HDFS_PID_DIR} # RUN mkdir -p ${DFS_CHECKPOINT_DIR} && \ chown -R ${HDFS_USER}:${HADOOP_GROUP} ${DFS_CHECKPOINT_DIR} && \ chmod -R 755 ${DFS_CHECKPOINT_DIR} ### YARN DIRs ########################################################### # YARN_LOCAL_DIR RUN mkdir -p ${YARN_LOCAL_DIR} && \ chown -R ${YARN_USER}:${HADOOP_GROUP} ${YARN_LOCAL_DIR} && \ chmod -R 755 ${YARN_LOCAL_DIR} # YARN log dir RUN mkdir -p ${YARN_LOG_DIR} && \ chown -R ${YARN_USER}:${HADOOP_GROUP} ${YARN_LOG_DIR} && \ chmod -R 755 ${YARN_LOG_DIR} # YARN_LOCAL_LOG_DIR RUN mkdir -p ${YARN_LOCAL_LOG_DIR} && \ chown -R ${YARN_USER}:${HADOOP_GROUP} ${YARN_LOCAL_LOG_DIR} && \ chmod -R 755 ${YARN_LOCAL_LOG_DIR} # YARN pid dir RUN mkdir -p $YARN_PID_DIR && \ chown -R $YARN_USER:$HADOOP_GROUP $YARN_PID_DIR && \ chmod -R 755 $YARN_PID_DIR ### MAPRED DIRs 
########################################################## # MAPRED log dir RUN mkdir -p $MAPRED_LOG_DIR && \ chown -R $MAPRED_USER:$HADOOP_GROUP $MAPRED_LOG_DIR && \ chmod -R 755 $MAPRED_LOG_DIR # MAPRED pid dir RUN mkdir -p $MAPRED_PID_DIR && \ chown -R $MAPRED_USER:$HADOOP_GROUP $MAPRED_PID_DIR && \ chmod -R 755 $MAPRED_PID_DIR RUN mkdir -p ${YARN_REMOTE_APP_LOG_DIR} && \ chown -R ${YARN_USER}:${HADOOP_GROUP} ${YARN_REMOTE_APP_LOG_DIR} && \ chmod -R 777 ${YARN_REMOTE_APP_LOG_DIR} COPY scripts/generate_conf_files.py /tmp/ RUN python2.7 /tmp/generate_conf_files.py ${HADOOP_CONF_DIR} ENV HADOOP_HOME /opt/hadoop ENV PATH ${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin:$PATH ================================================ FILE: dev_tools/docker/clusters/apache_2.6.0/images/base/scripts/generate_conf_files.py ================================================ import sys import os import xml.etree.cElementTree as ET def add_property(conf, name, value): prop = ET.SubElement(conf, 'property') ET.SubElement(prop, 'name').text = name ET.SubElement(prop, 'value').text = value def write_xml(root, fname): tree = ET.ElementTree(root) with open(fname, 'w') as f: f.write('<?xml version="1.0"?>\n') f.write('<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>') tree.write(f) def generate_xml_conf_file(fname, props): root = ET.Element("configuration") for name, value in props: add_property(root, name, value) write_xml(root, fname) def generate_core_site(fname): hostname = 'namenode' generate_xml_conf_file(fname, ( ('fs.defaultFS', 'hdfs://%s:8020' % hostname), ('hadoop.tmp.dir', 'file://' + os.environ['HADOOP_TMP_DIR']) )) def generate_hdfs_site(fname): generate_xml_conf_file(fname, ( ('dfs.replication', '1'), ('dfs.namenode.name.dir', 'file://' + os.environ['DFS_NAME_DIR']), ('dfs.datanode.data.dir', 'file://' + os.environ['DFS_DATA_DIR']), ('dfs.namenode.checkpoint.dir', os.environ['DFS_CHECKPOINT_DIR']), ('dfs.namenode.checkpoint.edits.dir', os.environ['DFS_CHECKPOINT_DIR']), )) def generate_yarn_site(fname): generate_xml_conf_file(fname, ( ('yarn.resourcemanager.hostname', 'resourcemanager'), ('yarn.nodemanager.hostname', 'nodemanager'), ('yarn.nodemanager.aux-services', 'mapreduce_shuffle'), ('yarn.nodemanager.aux-services.mapreduce.shuffle.class', 'org.apache.hadoop.mapred.ShuffleHandler'), # seconds to delay before deleting application # localized logs and files. > 0 if debugging.
('yarn.nodemanager.delete.debug-delay-sec', '600'), ('yarn.nodemanager.log-dirs', 'file://' + os.environ['YARN_LOCAL_LOG_DIR']), ('yarn.log.dir', os.environ['YARN_LOG_DIR']), ('yarn.nodemanager.remote-app-log-dir', os.environ['YARN_REMOTE_APP_LOG_DIR']), ('yarn.log-aggregation-enable', 'true'), # ('yarn.log-aggregation.retain-seconds', '360000'), # ('yarn.log-aggregation.retain-check-interval-seconds', '360'), # ('yarn.log.server.url', 'http://historyserver:19888'), )) def generate_mapred_site(fname): generate_xml_conf_file(fname, ( ('mapreduce.framework.name', 'yarn'), # MRv1 ('mapreduce.jobtracker.address', 'resourcemanager:8021'), ('mapreduce.jobtracker.http.address', 'resourcemanager:50030'), ('mapreduce.tasktracker.http.address', 'nodemanager:50060'), # History Server ('mapreduce.jobhistory.address', 'historyserver:10020'), ('mapreduce.jobhistory.webapp.address', 'historyserver:19888'), ('mapreduce.jobhistory.intermediate-done-dir', os.environ['MAPRED_JH_INTERMEDIATE_DONE_DIR']), ('mapreduce.jobhistory.done-dir', os.environ['MAPRED_JH_DONE_DIR']), )) def generate_capacity_scheduler(fname): generate_xml_conf_file(fname, ( ('yarn.scheduler.capacity.resource-calculator', 'org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator'), ('yarn.scheduler.capacity.root.queues', 'default'), ('yarn.scheduler.capacity.root.default.capacity', '100'), ('yarn.scheduler.capacity.root.default.user-limit-factor', '1'), ('yarn.scheduler.capacity.root.default.maximum-capacity', '100'), ('yarn.scheduler.capacity.root.default.state', 'RUNNING'), ('yarn.scheduler.capacity.root.default.acl_submit_applications', '*'), ('yarn.scheduler.capacity.root.default.acl_administer_queue', '*'), ('yarn.scheduler.capacity.node-locality-delay', '40'))) def main(argv): target_dir = argv[1] generate_core_site(os.path.join(target_dir, 'core-site.xml')) generate_hdfs_site(os.path.join(target_dir, 'hdfs-site.xml')) generate_yarn_site(os.path.join(target_dir, 'yarn-site.xml')) generate_mapred_site(os.path.join(target_dir, 'mapred-site.xml')) generate_capacity_scheduler(os.path.join(target_dir, 'capacity-scheduler.xml')) main(sys.argv) ================================================ FILE: dev_tools/docker/clusters/apache_2.6.0/images/base/scripts/zk_set.py ================================================ import sys import os from kazoo.client import KazooClient import logging logging.basicConfig() logger = logging.getLogger() logger.setLevel(logging.INFO) kz = KazooClient('zookeeper', int(os.environ['ZOO_CLIENT_PORT'])) path = '/' + sys.argv[1] value = sys.argv[2] kz.start() kz.set(path, value) kz.stop() ================================================ FILE: dev_tools/docker/clusters/apache_2.6.0/images/base/scripts/zk_wait.py ================================================ import sys import os import time from kazoo.client import KazooClient import logging logging.basicConfig() logger = logging.getLogger() logger.setLevel(logging.INFO) host = 'zookeeper' port = int(os.environ['ZOO_CLIENT_PORT']) logger.info('Starting on %s:%d', host, port) kz = KazooClient(host, port) path = '/' + sys.argv[1] logger.info('Path is %s', path) done = False while not done: kz.start(timeout=15) done = kz.exists(path) kz.stop() time.sleep(10) ================================================ FILE: dev_tools/docker/clusters/apache_2.6.0/images/bootstrap/Dockerfile ================================================ #---------------------------------------------------- FROM crs4_pydoop/apache_2.6.0_base:latest COPY scripts/bootstrap.py /tmp/ COPY 
scripts/create_hdfs_dirs.sh /tmp/ CMD ["/usr/bin/python", "/tmp/bootstrap.py"] ================================================ FILE: dev_tools/docker/clusters/apache_2.6.0/images/bootstrap/scripts/bootstrap.py ================================================ from kazoo.client import KazooClient import os import time import logging import platform logging.basicConfig() logger = logging.getLogger() logger.setLevel(logging.INFO) # FIXME this will break if our name is a substring of the hosts we are linked # to. def etc_updated(): hostname = platform.node() logger.info('Waiting for /etc/hosts to update on %s', hostname) if not hostname: raise RuntimeError('hostname is undefined') with open('/etc/hosts') as f: return sum(x.find(hostname) > -1 for x in f) > 1 logger.info('\tdone') def boot_node(kz, nodename): logger.info('Booting %s', nodename) path = '/' + nodename kz.create(path, 'boot') while kz.get(path)[0] != 'up': time.sleep(2) logger.info('\tdone.') def main(): logger.info('Starting bootstrap.') zookeeper_host = 'zookeeper' zookeeper_port = int(os.environ['ZOO_CLIENT_PORT']) while not etc_updated(): time.sleep(1) kz = KazooClient(hosts='%s:%d' % (zookeeper_host, zookeeper_port)) kz.start() boot_node(kz, 'namenode') boot_node(kz, 'datanode') os.system('bash /tmp/create_hdfs_dirs.sh') boot_node(kz, 'resourcemanager') boot_node(kz, 'nodemanager') boot_node(kz, 'historyserver') logger.info('Done with bootstrap.') main() ================================================ FILE: dev_tools/docker/clusters/apache_2.6.0/images/bootstrap/scripts/create_hdfs_dirs.sh ================================================ #!/bin/bash export HADOOP_LOG_DIR=${HDFS_LOG_DIR} export HADOOP_PID_DIR=${HDFS_PID_DIR} HADOOP_BIN=${HADOOP_HOME}/bin # su ${HDFS_USER} -c "${HADOOP_BIN}/hdfs dfs -mkdir -p ${YARN_REMOTE_APP_LOG_DIR}" # su ${HDFS_USER} -c "${HADOOP_BIN}/hdfs dfs -chown -R ${YARN_USER}:${HADOOP_GROUP} ${YARN_REMOTE_APP_LOG_DIR}" # su ${HDFS_USER} -c "${HADOOP_BIN}/hdfs dfs -chmod -R ${YARN_REMOTE_APP_LOG_DIR}" #for d in ${MAPRED_JH_DONE_DIR} ${MAPRED_JH_INTERMEDIATE_DONE_DIR} # do # su ${HDFS_USER} -c "${HADOOP_BIN}/hdfs dfs -mkdir -p ${d}" # su ${HDFS_USER} -c "${HADOOP_BIN}/hdfs dfs -chown -R ${MAPRED_USER}:${HADOOP_GROUP} ${d}" # su ${HDFS_USER} -c "${HADOOP_BIN}/hdfs dfs -chmod -R 777 ${d}" # done su ${HDFS_USER} -c "${HADOOP_BIN}/hdfs dfs -mkdir -p /tmp" su ${HDFS_USER} -c "${HADOOP_BIN}/hdfs dfs -chmod -R 1777 /tmp" echo "Creating /tmp/hadoop-yarn (owner ${MAPRED_USER}:${HADOOP_GROUP})" su ${HDFS_USER} -c "${HADOOP_BIN}/hdfs dfs -mkdir -p /tmp/hadoop-yarn/staging" #su ${HDFS_USER} -c "${HADOOP_BIN}/hdfs dfs -mkdir -p /tmp/hadoop-yarn/staging/history/tmp" su ${HDFS_USER} -c "${HADOOP_BIN}/hdfs dfs -chown -R ${MAPRED_USER}:${HADOOP_GROUP} /tmp/hadoop-yarn" su ${HDFS_USER} -c "${HADOOP_BIN}/hdfs dfs -chmod -R 1777 /tmp/hadoop-yarn" echo "Creating ${MAPRED_JH_INTERMEDIATE_DONE_DIR}" su ${HDFS_USER} -c "${HADOOP_BIN}/hdfs dfs -mkdir -p ${MAPRED_JH_INTERMEDIATE_DONE_DIR}" echo "Creating ${MAPRED_JH_DONE_DIR}" su ${HDFS_USER} -c "${HADOOP_BIN}/hdfs dfs -mkdir -p ${MAPRED_JH_DONE_DIR}" echo "Setting ownership (${MAPRED_USER}:${HADOOP_GROUP}) and permissions for ${MAPRED_JH_ROOT_DIR}" su ${HDFS_USER} -c "${HADOOP_BIN}/hdfs dfs -chown -R ${MAPRED_USER}:${HADOOP_GROUP} ${MAPRED_JH_ROOT_DIR}" su ${HDFS_USER} -c "${HADOOP_BIN}/hdfs dfs -chmod -R 1777 ${MAPRED_JH_ROOT_DIR}" su ${HDFS_USER} -c "${HADOOP_BIN}/hdfs dfs -mkdir -p /user/${UNPRIV_USER}" su ${HDFS_USER} -c "${HADOOP_BIN}/hdfs dfs -chown ${UNPRIV_USER} 
/user/${UNPRIV_USER}" su ${HDFS_USER} -c "${HADOOP_BIN}/hdfs dfs -mkdir -p /user/${MAPRED_USER}" su ${HDFS_USER} -c "${HADOOP_BIN}/hdfs dfs -chown ${MAPRED_USER} /user/${MAPRED_USER}" su ${MAPRED_USER} -c "${HADOOP_BIN}/hdfs dfs -mkdir /user/${MAPRED_USER}/logs" ================================================ FILE: dev_tools/docker/clusters/apache_2.6.0/images/datanode/Dockerfile ================================================ #---------------------------------------------------- FROM crs4_pydoop/apache_2.6.0_base:latest # EXPOSE 50020 COPY scripts/start_datanode.sh /tmp/ CMD ["/bin/bash", "/tmp/start_datanode.sh"] ================================================ FILE: dev_tools/docker/clusters/apache_2.6.0/images/datanode/scripts/start_datanode.sh ================================================ #!/bin/bash #--- manage_deamon stardard export HADOOP_LOG_DIR=${HDFS_LOG_DIR} export HADOOP_PID_DIR=${HDFS_PID_DIR} python /tmp/zk_wait.py datanode su - ${HDFS_USER} -p -c "${HADOOP_HOME}/sbin/hadoop-daemon.sh --config ${HADOOP_CONF_DIR} start datanode" # FIXME python /tmp/zk_set.py datanode up echo "Log is ${HDFS_LOG_DIR}/*datanode-${HOSTNAME}.out" tail -f ${HDFS_LOG_DIR}/*datanode-${HOSTNAME}.out ================================================ FILE: dev_tools/docker/clusters/apache_2.6.0/images/historyserver/Dockerfile ================================================ #---------------------------------------------------- FROM crs4_pydoop/apache_2.6.0_base:latest # EXPOSE 10020 19888 COPY scripts/start_historyserver.sh /tmp/ CMD ["/bin/bash", "/tmp/start_historyserver.sh"] ================================================ FILE: dev_tools/docker/clusters/apache_2.6.0/images/historyserver/scripts/start_historyserver.sh ================================================ #!/bin/bash python /tmp/zk_wait.py historyserver # we should actually check that the nodemanager is up ... python /tmp/zk_set.py historyserver up export HADOOP_JHS_LOGGER=DEBUG,JSA su ${MAPRED_USER} -c "${HADOOP_HOME}/bin/mapred --config ${HADOOP_CONF_DIR} historyserver 2>&1 >/tmp/logs/historyserver.out" ================================================ FILE: dev_tools/docker/clusters/apache_2.6.0/images/namenode/Dockerfile ================================================ #---------------------------------------------------- FROM crs4_pydoop/apache_2.6.0_base:latest # HDFS WebUI and HDFS default port EXPOSE 50070 9000 COPY scripts/start_namenode.sh /tmp/ CMD ["/bin/bash", "/tmp/start_namenode.sh"] ================================================ FILE: dev_tools/docker/clusters/apache_2.6.0/images/namenode/scripts/start_namenode.sh ================================================ #!/bin/bash #--- manage_deamon stardard export HADOOP_LOG_DIR=${HDFS_LOG_DIR} export HADOOP_PID_DIR=${HDFS_PID_DIR} python /tmp/zk_wait.py namenode su -l ${HDFS_USER} -c "${HADOOP_HOME}/bin/hdfs --config ${HADOOP_CONF_DIR} namenode -format" su -l -p ${HDFS_USER} -c "${HADOOP_HOME}/sbin/hadoop-daemon.sh --config ${HADOOP_CONF_DIR} start namenode" # we should actually check that the namenode is up ... 
python /tmp/zk_set.py namenode up echo "log is ${HDFS_LOG_DIR}/*namenode-${HOSTNAME}.out" tail -f ${HDFS_LOG_DIR}/*namenode-${HOSTNAME}.out ================================================ FILE: dev_tools/docker/clusters/apache_2.6.0/images/nodemanager/Dockerfile ================================================ #---------------------------------------------------- FROM crs4_pydoop/apache_2.6.0_base:latest # EXPOSE 8042 COPY scripts/start_nodemanager.sh /tmp/ CMD ["/bin/bash", "/tmp/start_nodemanager.sh"] ================================================ FILE: dev_tools/docker/clusters/apache_2.6.0/images/nodemanager/scripts/start_nodemanager.sh ================================================ #!/bin/bash export YARN_LOG_DIR=${YARN_LOG_DIR} export HADOOP_PID_DIR=${HDFS_PID_DIR} python /tmp/zk_wait.py nodemanager # YARN_OPTS="$YARN_OPTS -Dhadoop.log.dir=$YARN_LOG_DIR" # YARN_OPTS="$YARN_OPTS -Dyarn.log.dir=$YARN_LOG_DIR" # YARN_OPTS="$YARN_OPTS -Dhadoop.log.file=$YARN_LOGFILE" # YARN_OPTS="$YARN_OPTS -Dyarn.log.file=$YARN_LOGFILE" # YARN_OPTS="$YARN_OPTS -Dyarn.home.dir=$YARN_COMMON_HOME" # YARN_OPTS="$YARN_OPTS -Dyarn.id.str=$YARN_IDENT_STRING" # YARN_OPTS="$YARN_OPTS -Dhadoop.root.logger=${YARN_ROOT_LOGGER:-INFO,console}" # YARN_OPTS="$YARN_OPTS -Dyarn.root.logger=${YARN_ROOT_LOGGER:-INFO,console}" su - ${YARN_USER} -p -c "${HADOOP_HOME}/sbin/yarn-daemon.sh --config $HADOOP_CONF_DIR start nodemanager" # we should actually check that the nodemanager is up ... python /tmp/zk_set.py nodemanager up echo log is ${YARN_LOG_DIR}/*nodemanager-${HOSTNAME}.out tail -f ${YARN_LOG_DIR}/*nodemanager-${HOSTNAME}.out ================================================ FILE: dev_tools/docker/clusters/apache_2.6.0/images/resourcemanager/Dockerfile ================================================ #---------------------------------------------------- FROM crs4_pydoop/apache_2.6.0_base:latest # EXPOSE 8088 8021 8031 8032 8033 COPY scripts/start_resourcemanager.sh /tmp/ CMD ["/bin/bash", "/tmp/start_resourcemanager.sh"] ================================================ FILE: dev_tools/docker/clusters/apache_2.6.0/images/resourcemanager/scripts/start_resourcemanager.sh ================================================ #!/bin/bash export YARN_LOG_DIR=${YARN_LOG_DIR} export HADOOP_PID_DIR=${HDFS_PID_DIR} export YARN_OPTS='' export HADOOP_MAPRED_LOG_DIR=${YARN_LOG_DIR} # YARN_OPTS="$YARN_OPTS -Dhadoop.log.dir=$YARN_LOG_DIR" # YARN_OPTS="$YARN_OPTS -Dyarn.log.dir=$YARN_LOG_DIR" # YARN_OPTS="$YARN_OPTS -Dhadoop.log.file=$YARN_LOGFILE" # YARN_OPTS="$YARN_OPTS -Dyarn.log.file=$YARN_LOGFILE" # YARN_OPTS="$YARN_OPTS -Dyarn.home.dir=$YARN_COMMON_HOME" # YARN_OPTS="$YARN_OPTS -Dyarn.id.str=$YARN_IDENT_STRING" # YARN_OPTS="$YARN_OPTS -Dhadoop.root.logger=${YARN_ROOT_LOGGER:-INFO,console}" # YARN_OPTS="$YARN_OPTS -Dyarn.root.logger=${YARN_ROOT_LOGGER:-INFO,console}" python /tmp/zk_wait.py resourcemanager su - ${YARN_USER} -p -c "${HADOOP_HOME}/sbin/yarn-daemon.sh --config $HADOOP_CONF_DIR start resourcemanager" # su - ${MAPRED_USER} -p -c "${HADOOP_HOME}/sbin/mr-jobhistory-daemon.sh --config ${HADOOP_CONF_DIR} start historyserver" # we should actually check that the resourcemanager is up ... 
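# (hypothetical sketch, not in the original script: the resourcemanager could
#  be probed via its REST API before being flagged as up, e.g.:
#      until wget -qO- http://localhost:8088/ws/v1/cluster/info >/dev/null; do sleep 2; done
#  where 8088 is the default RM webapp port and wget is installed by the base image)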
python /tmp/zk_set.py resourcemanager up echo "log is ${YARN_LOG_DIR}/*resourcemanager-${HOSTNAME}.out" tail -f ${YARN_LOG_DIR}/*resourcemanager-${HOSTNAME}.out ================================================ FILE: dev_tools/docker/clusters/apache_2.6.0/images/zookeeper/Dockerfile ================================================ #---------------------------------------------------- FROM crs4_pydoop/apache_2.6.0_base:latest EXPOSE 2181 CMD ["/opt/zookeeper/bin/zkServer.sh", "start-foreground"] ================================================ FILE: dev_tools/docker/clusters/apache_2.6.0/images/zookeeper/scripts/start_namenode.sh ================================================ #!/bin/bash #--- manage_daemon standard export HADOOP_LOG_DIR=${HDFS_LOG_DIR} export HADOOP_PID_DIR=${HDFS_PID_DIR} python /tmp/zk_wait.py namenode su ${HDFS_USER} -c "${HADOOP_HOME}/bin/hdfs --config ${HADOOP_CONF_DIR} namenode -format" # we should actually check that the namenode is up ... python /tmp/zk_set.py namenode up su ${HDFS_USER} -c "${HADOOP_HOME}/bin/hdfs --config ${HADOOP_CONF_DIR} namenode" ================================================ FILE: dev_tools/docker/images/base/Dockerfile ================================================ #---------------------------------------------------- # # A basic Java machine with Java, basic services and IPv6 disabled #---------------------------------------------------- FROM debian:latest #---------------------------------------------------- # Install Java and basic services RUN echo "deb http://ppa.launchpad.net/webupd8team/java/ubuntu trusty main" | tee /etc/apt/sources.list.d/webupd8team-java.list && \ echo "deb-src http://ppa.launchpad.net/webupd8team/java/ubuntu trusty main" | tee -a /etc/apt/sources.list.d/webupd8team-java.list && \ apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys EEA14886 && \ apt-get update && \ echo yes | apt-get install -y --force-yes oracle-java8-installer && \ apt-get install -y \ apt-utils \ openssh-server \ python \ python-pip \ wget ENV JAVA_HOME /usr/lib/jvm/java-8-oracle RUN echo "export JAVA_HOME=${JAVA_HOME}" >> /etc/profile.d/java.sh #---------------------------------------------------- # disable ipv6 RUN echo "net.ipv6.conf.all.disable_ipv6=1" >> /etc/sysctl.conf && \ echo "net.ipv6.conf.default.disable_ipv6=1" >> /etc/sysctl.conf && \ echo "net.ipv6.conf.lo.disable_ipv6=1" >> /etc/sysctl.conf #---------------------------------------------------- # add default unprivileged user (Alfred E. Neuman, "What? Me worry?") ENV UNPRIV_USER aen RUN useradd -m ${UNPRIV_USER} -s /bin/bash && \ echo "${UNPRIV_USER}:hadoop" | chpasswd RUN mkdir -p /root/.ssh && \ ssh-keygen -t dsa -P '' -f /root/.ssh/id_dsa && \ cat /root/.ssh/id_dsa.pub >> /root/.ssh/authorized_keys ================================================ FILE: dev_tools/docker/images/client/Dockerfile ================================================ #---------------------------------------------------- FROM crs4_pydoop/base:latest #---------------------------------- # Install useful stuff # NO update. We should be in line with base RUN apt-get install -y git build-essential python-dev #---------------------------------- # Enable sshd RUN mkdir /var/run/sshd RUN echo 'root:hadoop' | chpasswd RUN sed -i 's/PermitRootLogin without-password/PermitRootLogin yes/' /etc/ssh/sshd_config # SSH login fix.
Otherwise user is kicked off after login RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd ENV NOTVISIBLE "in users profile" RUN echo "export VISIBLE=now" >> /etc/profile EXPOSE 22 #----------------------------------- CMD ["/usr/sbin/sshd", "-D"] ================================================ FILE: dev_tools/docker/scripts/build_base_images.sh ================================================ #!/bin/bash current_path=$(cd $(dirname ${BASH_SOURCE}); pwd; cd - >/dev/null) images_path="${current_path}/../images" echo "Building crs4_pydoop/base image (path: ${images_path}/base)" docker build -t crs4_pydoop/base ${images_path}/base echo "Building crs4_pydoop/client image (path: ${images_path}/client)" docker build -t crs4_pydoop/client ${images_path}/client ================================================ FILE: dev_tools/docker/scripts/build_cluster_images.sh ================================================ #!/bin/bash TAG=${1} CL_DIR=${TAG}/images for d in ${CL_DIR}/* do if [ -d ${d} -a -e ${d}/Dockerfile ]; then base=${d##${CL_DIR}/} docker build -t crs4_pydoop/${TAG}_${base} ${d} fi done exit # docker build -t crs4_pydoop/${TAG}_base ${CL_DIR}/base # docker build -t crs4_pydoop/${TAG}_zookeeper ${CL_DIR}/zookeeper # docker build -t crs4_pydoop/${TAG}_namenode ${CL_DIR}/namenode # docker build -t crs4_pydoop/${TAG}_datanode ${CL_DIR}/datanode # docker build -t crs4_pydoop/${TAG}_resourcemanager ${CL_DIR}/resourcemanager # docker build -t crs4_pydoop/${TAG}_nodemanager ${CL_DIR}/nodemanager # docker build -t crs4_pydoop/${TAG}_historyserver ${CL_DIR}/historyserver # docker build -t crs4_pydoop/${TAG}_bootstrap ${CL_DIR}/bootstrap ================================================ FILE: dev_tools/docker/scripts/share_etc_hosts.py ================================================ import os import sys import ssl import logging from docker import tls from docker import Client logging.basicConfig() logger = logging.getLogger('share_etc_hosts') logger.setLevel(logging.DEBUG) class App(object): def __init__(self, compose_group_name): self.client = docker_client() self.containers = self._get_containers(compose_group_name) def _get_containers(self, compose_group_name): head = '/%s_' % compose_group_name cs = [c for c in self.client.containers() if c['Names'][0].startswith(head)] return cs def _get_hosts(self): hosts = {} for c in self.containers: d = self.client.inspect_container(c['Id']) hosts[c['Id']] = (d['NetworkSettings']['IPAddress'], d['Config']['Hostname']) return hosts def share_etc_hosts(self): hosts = self._get_hosts() host_table = str('\n'.join(['%s\t%s' % h for h in hosts.itervalues()])) logger.debug('Host table is:\n%s', host_table) cmd = '/bin/bash -c "echo -e %r >> /etc/hosts"' % host_table for k in hosts: logger.debug('Updating %s', k) print(self.client.execute(k, cmd)) def docker_client(): """ Returns a docker-py client configured using environment variables according to the same logic as the official Docker client. 
""" cert_path = os.environ.get('DOCKER_CERT_PATH', '') if cert_path == '': cert_path = os.path.join(os.environ.get('HOME', ''), '.docker') base_url = os.environ.get('DOCKER_HOST') tls_config = None if os.environ.get('DOCKER_TLS_VERIFY', '') != '': parts = base_url.split('://', 1) base_url = '%s://%s' % ('https', parts[1]) client_cert = (os.path.join(cert_path, 'cert.pem'), os.path.join(cert_path, 'key.pem')) ca_cert = os.path.join(cert_path, 'ca.pem') tls_config = tls.TLSConfig( ssl_version=ssl.PROTOCOL_TLSv1, verify=True, assert_hostname=False, client_cert=client_cert, ca_cert=ca_cert, ) timeout = int(os.environ.get('DOCKER_CLIENT_TIMEOUT', 60)) return Client( base_url=base_url, tls=tls_config, version='1.15', timeout=timeout ) def main(argv): tag = argv[1].replace('.', '').replace('_', '') logger.info('Tag is:%s', tag) app = App(tag) app.share_etc_hosts() main(sys.argv) ================================================ FILE: dev_tools/docker/scripts/start_client.sh ================================================ #!/bin/bash #------------------------------------------- # # Insert a new client in a running cluster # # Usage: # $ cd client_side_tests/ # $ ../../scripts/start_client.sh # real_path=`readlink -f ${BASH_SOURCE[0]}` script_dir=`dirname ${real_path}` share_hosts_bin="python ${script_dir}/share_etc_hosts.py" client_dir=`basename $PWD` port=${1:-3333} if [[ -z "${DOCKER_HOST_IP}" ]] then echo "No explicit DOCKER_HOST_IP in your env: localhost is assumed" DOCKER_HOST_IP=localhost fi # We assume that there is only one service with that name cluster_tag=$(docker ps | grep resourcemanager | \ awk '{print $NF}'| sed -e 's/_.*$//') client_name=${cluster_tag}_client_${client_dir} docker run -d --name ${client_name} -p ${port}:22 crs4_pydoop/client:latest ${share_hosts_bin} ${cluster_tag} rm_id=$(docker ps | grep resourcemanager | awk '{print $1}') client_id=$(docker ps | grep ${client_name} | awk '{print $1}') (cat ${HOME}/.ssh/id_dsa.pub | docker exec -i ${client_id} tee -a /root/.ssh/authorized_keys) > /dev/null if [ -x ./initialize.sh ]; then ./initialize.sh ${port} ${client_id} ${rm_id} ${DOCKER_HOST_IP} fi ================================================ FILE: dev_tools/docker/scripts/start_cluster.sh ================================================ #!/bin/bash cluster_name=$1 script_dir=$(cd $(dirname ${BASH_SOURCE}); pwd; cd - >/dev/null) share_hosts_bin="python ${script_dir}/share_etc_hosts.py" cluster_path="${script_dir}/../clusters/${cluster_name}" tag=`echo ${cluster_name} | tr -d '._/'` cd ${cluster_path} docker-compose stop docker-compose rm for x in logs local do if [ -d ${x} ]; then backup=${x}.backup.$$ mv ${x} ${backup} echo "Moved ${x} to ${backup}" fi mkdir ${x} chmod 1777 ${x} done docker-compose up -d ${share_hosts_bin} ${tag} ================================================ FILE: dev_tools/docker_build ================================================ #!/usr/bin/env bash set -euo pipefail this="${BASH_SOURCE-$0}" this_dir=$(cd -P -- "$(dirname -- "${this}")" && pwd -P) pushd "${this_dir}/.." docker build --build-arg HADOOP_MAJOR_VERSION=2 -t crs4/pydoop-hadoop2 . docker build -t crs4/pydoop . docker build -t crs4/pydoop-docs -f Dockerfile.docs . popd ================================================ FILE: dev_tools/dump_app_params ================================================ #!/usr/bin/env python """ Dump app options in rst table format. """ import sys import argparse import pydoop.app.main AUTOGEN_NOTICE = """\ .. Auto-generated by %(prog)s. DO NOT EDIT! 
To update, run: %(prog)s --app %(app)s -o %(out_fn)s """ def set_option_attrs(actions): for a in actions: opts = a.option_strings assert len(opts) > 0 try: a.short_opt, a.long_opt = opts except ValueError: o = opts[0] assert o.startswith('-') if o.startswith('--'): a.short_opt, a.long_opt = None, o else: a.short_opt, a.long_opt = o, None def get_col_widths(actions): lengths = {} for a in actions: for n in 'short_opt', 'long_opt', 'help': attr = getattr(a, n) lengths.setdefault(n, []).append(0 if attr is None else len(attr)) widths = dict((k, max(v)) for k, v in lengths.items()) # add 4 for ``backticks`` for n in 'short_opt', 'long_opt': widths[n] += 4 return widths class Formatter(object): NAMES = 'short_opt', 'long_opt', 'help' def __init__(self, actions): self.col_widths = get_col_widths(actions) self.actions = actions def format_line(self, fields): ln = [f.ljust(self.col_widths[n]) for f, n in zip(fields, self.NAMES)] return '| %s |' % ' | '.join(ln) def format_action(self, action): ln = [] for n in 'short_opt', 'long_opt': opt = getattr(action, n) ln.append('``%s``' % opt if opt else '') ln.append(getattr(action, 'help')) return self.format_line(ln) def hline(self, filler='-'): ln = [] for n in self.NAMES: ln.append(filler * self.col_widths[n]) return '+{0}{1}{0}+'.format( filler, '{0}+{0}'.format(filler).join(ln) ) def header_lines(self): lines = [self.hline()] lines.append(self.format_line(['Short', 'Long', 'Meaning'])) lines.append(self.hline(filler='=')) return lines def dump_table(self, outf, exclude_h=True): for ln in self.header_lines(): outf.write(ln + '\n') for a in self.actions: if exclude_h and a.short_opt == '-h': continue outf.write(self.format_action(a) + '\n') outf.write(self.hline() + '\n') def make_parser(): parser = argparse.ArgumentParser(description='dump pydoop app help') parser.add_argument('-o', '--out-fn', metavar='FILE', help='output file') parser.add_argument('--app', metavar='PYDOOP_APP_NAME', default='script') return parser def main(): parser = make_parser() args = parser.parse_args() outf = None pydoop_parser = pydoop.app.main.make_parser() subp = pydoop_parser._pydoop_docs_helper[args.app] act_map = dict((_.title, _._group_actions) for _ in subp._action_groups) actions = act_map['optional arguments'] set_option_attrs(actions) fmt = Formatter(actions) try: outf = open(args.out_fn, 'w') if args.out_fn else sys.stdout outf.write(AUTOGEN_NOTICE % { 'prog': sys.argv[0], 'app': args.app, 'out_fn': args.out_fn }) fmt.dump_table(outf) finally: if outf: outf.close() if __name__ == '__main__': main() ================================================ FILE: dev_tools/edit_conf ================================================ #!/usr/bin/env python """\ A utility to edit hadoop configuration files. 
Usage:: $ edit_conf conf/yarn-site.xml tmp.xml \ yarn.nodemanager.resource.cpu-vcores 2 \ yarn.nodemanager.resource.memory-mb 1024 """ from lxml import etree as ET import sys def doc_to_dict(doc): props = {} root = doc.getroot() for p in root.findall('property'): props[p.find('name').text] = p.find('value').text return props def dict_to_doc(props): doc = ET.ElementTree(ET.fromstring('<configuration></configuration>')) root = doc.getroot() pi = ET.ProcessingInstruction( 'xml-stylesheet', 'type="text/xsl" href="configuration.xsl"') root.addprevious(pi) for k in props: p = ET.SubElement(root, "property") name = ET.SubElement(p, "name") val = ET.SubElement(p, "value") name.text, val.text = k, props[k] return doc def main(argv): assert len(argv) >= 2 and not (len(argv) & 0x01) conf_input = argv[0] conf_output = argv[1] doc = ET.parse(conf_input) props = doc_to_dict(doc) ai = iter(argv[2:]) for k, v in zip(ai, ai): props[k] = v ndoc = dict_to_doc(props) with open(conf_output, 'wb') as f: f.write(ET.tostring( ndoc, encoding="utf-8", xml_declaration=True, pretty_print=True )) if __name__ == "__main__": main(sys.argv[1:]) ================================================ FILE: dev_tools/git_export ================================================ #!/usr/bin/env python """ Export git working copy including uncommitted changes """ import sys import os import argparse import shutil import subprocess as sp THIS_DIR = os.path.dirname(os.path.abspath(__file__)) PARENT_DIR = os.path.dirname(THIS_DIR) DEFAULT_EXPORT_DIR = os.path.join(PARENT_DIR, "git_export") def get_sources(): cmd = "git ls-files --full-name %s" % PARENT_DIR return sp.check_output(cmd, shell=True).splitlines() def export(sources, export_root): if os.path.isdir(export_root): shutil.rmtree(export_root) os.makedirs(export_root) for fn in sources: d, bn = os.path.split(fn) if bn.startswith(".git"): print "skipping", fn continue d = os.path.join(export_root, d) if not os.path.isdir(d): os.makedirs(d) in_path = os.path.join(PARENT_DIR, fn) if os.path.islink(in_path): in_path = os.path.realpath(in_path) out_path = os.path.join(d, bn) if os.path.isdir(in_path): shutil.copytree(in_path, out_path, symlinks=True) else: shutil.copy(in_path, out_path) else: shutil.copy(in_path, d) def make_parser(): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("-o", "--output-dir", metavar="DIR", help="output directory", default=DEFAULT_EXPORT_DIR) return parser def main(argv): parser = make_parser() args = parser.parse_args(argv[1:]) sources = get_sources() export(sources, args.output_dir) if __name__ == "__main__": main(sys.argv) ================================================ FILE: dev_tools/import_src ================================================ #!/usr/bin/env python """ Import Hadoop pipes/utils source code. NOTE: starting from cdh4.3, there is a single Hadoop tarball with both mr2 and mr1 code. The latter is located in: ${HADOOP_HOME}/src/hadoop-mapreduce1-project/. To fetch the code for mrv1, run import_src ${HADOOP_HOME}/src/hadoop-mapreduce1-project; to fetch the code for mrv2, run import_src ${HADOOP_HOME} --skip-dir hadoop-mapreduce1-project.
""" import sys, os, argparse, warnings, shutil WANTED = { # basename: relative location "StringUtils.cc": "utils/impl", "SerialUtils.cc": "utils/impl", "StringUtils.hh": "utils/api/hadoop", "SerialUtils.hh": "utils/api/hadoop", "HadoopPipes.cc": "pipes/impl", "Pipes.hh": "pipes/api/hadoop", "TemplateFactory.hh": "pipes/api/hadoop", #--- libhdfs, all versions --- "hdfs.h": "libhdfs", "hdfs.c": "libhdfs", # --- libhdfs, old versions --- "hdfsJniHelper.h": "libhdfs", "hdfsJniHelper.c": "libhdfs", # --- libhdfs, recent versions --- "jni_helper.h": "libhdfs", "jni_helper.c": "libhdfs", "native_mini_dfs.h": "libhdfs", "native_mini_dfs.c": "libhdfs", "exception.h": "libhdfs", "exception.c": "libhdfs", # --- java pipes --- "Application.java": "org/apache/hadoop/mapred/pipes", "BinaryProtocol.java": "org/apache/hadoop/mapred/pipes", "DownwardProtocol.java": "org/apache/hadoop/mapred/pipes", "OutputHandler.java": "org/apache/hadoop/mapred/pipes", "PipesMapRunner.java": "org/apache/hadoop/mapred/pipes", "PipesNonJavaInputFormat.java": "org/apache/hadoop/mapred/pipes", "PipesPartitioner.java": "org/apache/hadoop/mapred/pipes", "PipesReducer.java": "org/apache/hadoop/mapred/pipes", "Submitter.java": "org/apache/hadoop/mapred/pipes", "UpwardProtocol.java": "org/apache/hadoop/mapred/pipes", "LocalJobRunner.java": "org/apache/hadoop/mapred", } def get_sources(root_dir, skip=None): sources = {} for d, _, basenames in os.walk(root_dir): if skip in d.split(os.sep): continue for bn in basenames: if bn in WANTED: if d.endswith(WANTED[bn]): sources[bn] = os.path.join(d, bn) missing = set(WANTED) - set(sources) if missing: warnings.warn("not found: %r" % (sorted(missing),)) return sources def make_parser(): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument('hadoop_home', metavar="HADOOP_HOME") parser.add_argument("-o", "--output-dir", metavar="DIR", help="output directory") parser.add_argument("-s", "--skip-dir", metavar="DIR", help="skip directories with this basename") return parser def main(argv): parser = make_parser() args = parser.parse_args(argv[1:]) if not args.output_dir: this_dir = os.path.dirname(os.path.abspath(__file__)) parent_dir = os.path.dirname(this_dir) args.output_dir = os.path.join( parent_dir, "src", os.path.basename(args.hadoop_home.rstrip("/")) ) if args.skip_dir: args.skip_dir = os.path.basename(args.skip_dir) sources = get_sources(args.hadoop_home, skip=args.skip_dir) for bn, p in sources.iteritems(): out_dir = os.path.join(args.output_dir, WANTED[bn]) try: os.makedirs(out_dir) except OSError: pass shutil.copy(p, out_dir) print "%s -> %s" % (p, out_dir) if __name__ == "__main__": main(sys.argv) ================================================ FILE: dev_tools/mapred_pipes ================================================ #!/usr/bin/env bash # Set up the layout needed to build the "mapred" version of pipes set -euo pipefail this="${BASH_SOURCE-$0}" this_dir=$(cd -P -- "$(dirname -- "${this}")" && pwd -P) if [ $# -lt 1 ]; then echo "Usage: $0 HADOOP_SRC" exit 1 fi if [ ! -d "${1}"/hadoop-mapreduce-project ]; then echo "ERROR: \"$1\" does not look like a Hadoop source dir" exit 1 fi hadoop_src=${1} pushd "${this_dir}/.." 
mapred_pipes_dir=src/it/crs4/pydoop/mapred/pipes rm -rf "${mapred_pipes_dir}" mkdir -p "${mapred_pipes_dir}" cp -rf "${hadoop_src}"/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/pipes/* "${mapred_pipes_dir}"/ sed -i 's/package org\.apache\.hadoop/package it\.crs4\.pydoop/g' "${mapred_pipes_dir}"/* # not exactly future-proof sed_cmd="s|self\.java_files = |self\.java_files = glob.glob(\"${mapred_pipes_dir}/*.java\") + |" sed -i "${sed_cmd}" setup.py popd ================================================ FILE: dev_tools/unpack_debian ================================================ #!/usr/bin/env python """ Unpack debian packages -- a quick shortcut for debug purposes. """ import sys, os, argparse, shutil, subprocess as sp THIS_DIR = os.path.dirname(os.path.abspath(__file__)) PARENT_DIR = os.path.dirname(THIS_DIR) DEFAULT_FROM_DIR = os.path.join(PARENT_DIR, "sandbox") DEFAULT_TO_DIR = os.path.join(PARENT_DIR, "temp") def get_pkg_map(from_dir): pkg_map = {} for fn in os.listdir(from_dir): if fn.endswith(".deb"): tag = fn.split("_", 1)[0] pkg_map[tag] = os.path.abspath(os.path.join(from_dir, fn)) return pkg_map def unpack(pkg_map, to_dir): if os.path.isdir(to_dir): shutil.rmtree(to_dir) os.makedirs(to_dir) for tag, fn in pkg_map.iteritems(): d = os.path.join(to_dir, tag) os.makedirs(d) old_wd = os.getcwd() os.chdir(d) print "unpacking %s to %s" % (fn, d) sp.check_call("ar x %s" % fn, shell=True) sp.check_call("tar xf data.tar.gz", shell=True) sp.check_call("tar xf control.tar.gz", shell=True) os.chdir(old_wd) def make_parser(): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("-i", "--input-dir", metavar="DIR", help="input directory", default=DEFAULT_FROM_DIR) parser.add_argument("-o", "--output-dir", metavar="DIR", help="output directory", default=DEFAULT_TO_DIR) return parser def main(argv): parser = make_parser() args = parser.parse_args(argv[1:]) pkg_map = get_pkg_map(args.input_dir) unpack(pkg_map, args.output_dir) if __name__ == "__main__": main(sys.argv) ================================================ FILE: dev_tools/update_docs ================================================ #!/bin/bash set -eu die() { echo "$1" 1>&2 exit 1 } DOCS_PREFIX="docs/_build/html" REPO="https://github.com/crs4/pydoop.git" [ -f "setup.py" ] || die "ERROR: run from the main repo dir" git subtree pull --prefix="${DOCS_PREFIX}" "${REPO}" gh-pages --squash make docs git add "${DOCS_PREFIX}" git commit -a -m "updated gh-pages" git subtree push --prefix="${DOCS_PREFIX}" "${REPO}" gh-pages --squash ================================================ FILE: docs/Makefile ================================================ # Makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = sphinx-build PAPER = BUILDDIR = _build # Internal variables. PAPEROPT_a4 = -D latex_paper_size=a4 PAPEROPT_letter = -D latex_paper_size=letter ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
.PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex changes linkcheck doctest help: @echo "Please use \`make ' where is one of" @echo " html to make standalone HTML files" @echo " dirhtml to make HTML files named index.html in directories" @echo " pickle to make pickle files" @echo " json to make JSON files" @echo " htmlhelp to make HTML files and a HTML help project" @echo " qthelp to make HTML files and a qthelp project" @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" @echo " changes to make an overview of all changed/added/deprecated items" @echo " linkcheck to check all external links for integrity" @echo " doctest to run all doctests embedded in the documentation (if enabled)" clean: -rm -rf $(BUILDDIR)/* html: $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." dirhtml: $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." pickle: $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle @echo @echo "Build finished; now you can process the pickle files." json: $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json @echo @echo "Build finished; now you can process the JSON files." htmlhelp: $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp @echo @echo "Build finished; now you can run HTML Help Workshop with the" \ ".hhp project file in $(BUILDDIR)/htmlhelp." qthelp: $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp @echo @echo "Build finished; now you can run "qcollectiongenerator" with the" \ ".qhcp project file in $(BUILDDIR)/qthelp, like this:" @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Pydoop.qhcp" @echo "To view the help file:" @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Pydoop.qhc" latex: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." @echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \ "run these through (pdf)latex." changes: $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes @echo @echo "The overview file is in $(BUILDDIR)/changes." linkcheck: $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck @echo @echo "Link check complete; look for any errors in the above output " \ "or in $(BUILDDIR)/linkcheck/output.txt." doctest: $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest @echo "Testing of doctests in the sources finished, look at the " \ "results in $(BUILDDIR)/doctest/output.txt." ================================================ FILE: docs/_build/.gitignore ================================================ * !.gitignore !html ================================================ FILE: docs/_templates/layout.html ================================================ {% extends "!layout.html" %} {%- macro mysidebar() %} {%- if not embedded %}{% if not theme_nosidebar|tobool %}
{%- block sidebarlogo %} {%- if logo %} {%- endif %} {%- endblock %}
{%- block sidebartoc %}
{%- if display_toc %}
{{ _('Table Of Contents') }}
{{ toc }}
{%- endif %}
{%- endblock %}
{%- block sidebarrel %}
{%- if prev %}
{{ _('Previous topic') }}
{{ prev.title }}
{%- endif %}
{%- if next %}
{{ _('Next topic') }}
{{ next.title }}
{%- endif %}
{%- endblock %}
{%- block sidebarsourcelink %} {%- endblock %}
Get Pydoop
Contributors
Pydoop is developed by: CRS4
{%- if customsidebar %} {% include customsidebar %} {%- endif %}
{%- block sidebarsearch %} {%- if pagename != "search" %} {%- endif %} {%- endblock %}
{%- endif %}{% endif %}
{%- endmacro %}
{% block rootrellink %}
  • Home
  • Installation
  • Support
  • Git Repo
  • Pydoop 1
  • {% endblock %} {# put the sidebar before the body #} {% block sidebar1 %} {{ mysidebar() }} {% endblock %} {% block sidebar2 %}{% endblock %} ================================================ FILE: docs/api_docs/hadut.rst ================================================ .. _hadut: :mod:`pydoop.hadut` --- Hadoop shell interaction ================================================ .. automodule:: pydoop.hadut :members: ================================================ FILE: docs/api_docs/hdfs_api.rst ================================================ .. _hdfs-api: :mod:`pydoop.hdfs` --- HDFS API =============================== .. automodule:: pydoop.hdfs :members: .. automodule:: pydoop.hdfs.path :members: .. automodule:: pydoop.hdfs.fs :members: .. automodule:: pydoop.hdfs.file :members: FileIO .. autoclass:: pydoop.hdfs.file.local_file ================================================ FILE: docs/api_docs/index.rst ================================================ .. _api-docs: API Docs ======== .. toctree:: mr_api hdfs_api hadut ================================================ FILE: docs/api_docs/mr_api.rst ================================================ .. _mr_api: :mod:`pydoop.mapreduce.api` --- MapReduce API ============================================= .. automodule:: pydoop.mapreduce.api :members: .. autofunction:: pydoop.mapreduce.pipes.run_task ================================================ FILE: docs/conf.py ================================================ # -*- coding: utf-8 -*- # # Pydoop documentation build configuration file, created by # sphinx-quickstart on Sun Jun 20 17:06:55 2010. # # This file is execfile()d with the current directory set to its # containing dir. # # Note that not all possible configuration values are present in this # autogenerated file. # # All configuration values have a default; values that are commented out # serve to show the default. import datetime FIRST_RELEASE_YEAR = 2009 CURRENT_YEAR = datetime.datetime.now().year # No need to hack the path, we install before building docs # sys.path[1:1] = [ os.path.abspath('../pydoop') ] # -- General configuration ---------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.imgmath', 'sphinx.ext.ifconfig', 'sphinx.ext.intersphinx' ] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # The suffix of source filenames. source_suffix = '.rst' # The encoding of source files. # source_encoding = 'utf-8' # The master toctree document. master_doc = 'index' # General information about the project. project = u'Pydoop' copyright = u'%d-%d, CRS4' % (FIRST_RELEASE_YEAR, CURRENT_YEAR) # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. with open("../VERSION") as f: version_string = f.read().strip() version = ".".join(version_string.split(".", 2)[:2]) # The full version, including alpha/beta/rc tags. release = version_string # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
# language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: # today = '' # Else, today_fmt is used as the format for a strftime call. # today_fmt = '%B %d, %Y' # Avoid doc-not-included-in-toctree warning exclude_patterns = [ 'pydoop_script_options.rst', # included with ..include:: 'pydoop_submit_options.rst', # included with ..include:: ] # List of directories, relative to source directory, that shouldn't be searched # for source files. exclude_trees = ['_build'] # The reST default role (used for this markup: `text`) to use for all # documents. # default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. # add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). # add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. # show_authors = False # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' # A list of ignored prefixes for module index sorting. # modindex_common_prefix = [] # -- Options for HTML output -------------------------------------------------- # The theme to use for HTML and HTML Help pages. Major themes that come with # Sphinx are currently 'default' and 'sphinxdoc'. html_theme = 'sphinxdoc' # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. # html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. # html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". # html_title = None # A shorter title for the navigation bar. Default is the same as html_title. # html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. html_logo = "_static/logo.png" # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. html_favicon = "_static/favicon.ico" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. # html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. # html_use_smartypants = True # Custom sidebar templates, maps document names to template names. # html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. # html_additional_pages = {} # If false, no module index is generated. # html_use_modindex = True # If false, no index is generated. # html_use_index = True # If true, the index is split into individual pages for each letter. # html_split_index = False # If true, links to the reST sources are added to the pages. # html_show_sourcelink = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. 
# html_use_opensearch = '' # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). # html_file_suffix = '' # Output file base name for HTML help builder. htmlhelp_basename = 'Pydoopdoc' # -- Options for LaTeX output ------------------------------------------------- # The paper size ('letter' or 'a4'). # latex_paper_size = 'letter' # The font size ('10pt', '11pt' or '12pt'). # latex_font_size = '10pt' # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass # [howto/manual]). latex_documents = [ ('index', 'Pydoop.tex', u'Pydoop Documentation', u'Simone Leo, Gianluigi Zanetti', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of # the title page. # latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. # latex_use_parts = False # Additional stuff for the LaTeX preamble. # latex_preamble = '' # Documents to append as an appendix to all manuals. # latex_appendices = [] # If false, no module index is generated. # latex_use_modindex = True # Example configuration for intersphinx: refer to the Python standard library. intersphinx_mapping = {'python': ('http://docs.python.org/2.7', None)} ================================================ FILE: docs/examples/avro.rst ================================================ .. _avro_io: Avro I/O ======== Pydoop transparently supports reading and writing `Avro `_ records in MapReduce applications. This must be enabled by setting appropriate options in ``pydoop submit`` (see below). The following program implements a (slightly modified) version of the color count example from the Avro docs: .. literalinclude:: ../../examples/avro/py/color_count.py :language: python :start-after: DOCS_INCLUDE_START The application counts the per-office occurrence of favorite colors in a dataset of user records with the following structure: .. literalinclude:: ../../examples/avro/schemas/user.avsc :language: javascript User records are read from an Avro container stored on HDFS, and results are written to another Avro container with the following schema: .. literalinclude:: ../../examples/avro/schemas/stats.avsc :language: javascript Pydoop transparently serializes and/or deserializes Avro data as needed, allowing you to work directly with Python dictionaries. To get this behavior, enable Avro I/O and specify the output schema as follows: .. code-block:: bash export STATS_SCHEMA=$(cat stats.avsc) pydoop submit \ -D pydoop.mapreduce.avro.value.output.schema="${STATS_SCHEMA}" \ --avro-input v --avro-output v \ --upload-file-to-cache color_count.py \ color_count input output The ``--avro-input v`` and ``--avro-output v`` flags specify that we want to work with Avro records on MapReduce values; the other possible choices are ``"k"``, where records are exchanged over keys, and ``"kv"``, which assumes that the top-level record structure has two fields named ``"key"`` and ``"value"`` and passes the former on keys and the latter on values. Note that we did not have to specify any input schema: in this case, Avro automatically falls back to the *writer schema*, i.e., the one that's been used to write the container file. The ``examples/avro`` directory contains examples for all I/O modes. Avro-Parquet I/O ---------------- The above example focuses on `Avro containers `_. However, Pydoop supports any input/output format that exchanges Avro records. 
In particular, it can be used to read from and write to Avro-Parquet files, i.e., `Parquet `_ files that use the Avro object model. .. note:: Make sure you have Parquet version 1.6 or later to avoid running into `object reuse problems `_. More generally, the record writer must be aware of the fact that records passed to its ``write`` method are mutable and can be reused by the caller. The following application reproduces the k-mer count example from the `ADAM `_ docs: .. literalinclude:: ../../examples/avro/py/kmer_count.py :language: python :start-after: DOCS_INCLUDE_START To run the above program, execute ``pydoop submit`` as follows: .. code-block:: bash export PROJECTION=$(cat projection.avsc) pydoop submit \ -D parquet.avro.projection="${PROJECTION}" \ --upload-file-to-cache kmer_count.py \ --input-format parquet.avro.AvroParquetInputFormat \ --avro-input v --libjars "path/to/the/parquet/jar" \ kmer_count input output Since we are using an external input format (Avro container input and output formats are integrated into the Java Pydoop code), we have to specify the corresponding class via ``--input-format`` and its jar with ``--libjars``. The optional Parquet projection allows you to extract only selected fields from the input data. Note that, in this case, reading input records as values is not optional: that's how ``AvroParquetInputFormat`` works. More Avro-Parquet examples are available under ``examples/avro``. Running the examples -------------------- To run the Avro examples you have to install the Python Avro package (you can get it from the Avro web site), while the ``avro`` jar is included in Hadoop and the ``avro-mapred`` one is included in Pydoop. Part of the examples code (e.g., input generation) is written in Java. Compilation and packaging into a jar is handled by the bash runners, but `Maven `_ needs to be installed on the client machine. ================================================ FILE: docs/examples/index.rst ================================================ .. _examples: Examples ======== .. toctree:: :maxdepth: 2 intro sequence_file input_format avro ================================================ FILE: docs/examples/input_format.rst ================================================ .. _input_format_example: Writing a Custom InputFormat ============================ You can use a custom Java ``InputFormat`` together with a Python :class:`~pydoop.mapreduce.api.RecordReader`: the Java ``RecordReader`` supplied by the ``InputFormat`` will be overridden by the Python one. Consider the following simple modification of Hadoop's built-in ``TextInputFormat``: .. literalinclude:: ../../examples/input_format/it/crs4/pydoop/mapreduce/TextInputFormat.java :language: java :start-after: DOCS_INCLUDE_START With respect to the default one, this InputFormat adds a configurable boolean parameter (``pydoop.input.issplitable``) that, if set to ``false``, makes input files non-splittable (i.e., you can't get more input splits than the number of input files). For details on how to compile the above code into a jar and use it with Pydoop, see ``examples/input_format``\ . ================================================ FILE: docs/examples/intro.rst ================================================ Introduction ============ Pydoop includes several usage examples: you can find them in the "examples" subdirectory of the distribution root.
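For a taste of what they look like, here is a minimal word count in the Pydoop Script style (a sketch along the lines of ``examples/pydoop_script/scripts/wordcount.py``; refer to the actual file in the examples tree for the tested version):

.. code-block:: python

   def mapper(_, text, writer):
       for word in text.split():
           writer.emit(word, 1)


   def reducer(word, icounts, writer):
       writer.emit(word, sum(map(int, icounts)))

It would be run with ``pydoop script wordcount.py hdfs_input hdfs_output``.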
Python Dependencies ------------------- If you've installed Pydoop or other Python packages needed by your application in a non-standard location (e.g., ``/opt/lib/python3.6/site-packages``), the Python code that runs within Hadoop tasks might not be able to find them. Note that, depending on your Hadoop version or configuration, map and reduce tasks might run as a different user than the one who launched the job. If you can't install globally, Pydoop offers the option of shipping packages automatically upon job submission, see the section on :ref:`installation-free usage`. Input Data ---------- Most examples, by default, take their input from a free version of Lewis Carroll's "Alice's Adventures in Wonderland" available at `Project Gutenberg `_ (see the ``examples/input`` sub-directory). ================================================ FILE: docs/examples/sequence_file.rst ================================================ Using the Hadoop SequenceFile Format ==================================== Although many MapReduce applications deal with text files, there are many cases where processing binary data is required. In such cases, you basically have two options: #. write appropriate :class:`~pydoop.mapreduce.api.RecordReader` / :class:`~pydoop.mapreduce.api.RecordWriter` classes for the binary format you need to process #. convert your data to Hadoop's standard ``SequenceFile`` format. To write sequence files with Pydoop, set the output format and the compression type as follows:: pydoop submit \ --output-format=org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat \ -D mapreduce.output.fileoutputformat.compress.type=NONE|RECORD|BLOCK [...] To read sequence files, set the input format as follows:: pydoop submit \ --input-format=org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat Example Application: Filter Wordcount Results --------------------------------------------- ``SequenceFile`` is mostly useful for handling complex objects like C-style structs or images. To keep our example as simple as possible, we considered a situation where a MapReduce task needs to emit the raw bytes of an integer value. We wrote a trivial application that reads input from a previous :ref:`word count ` run and filters out words whose count falls below a configurable threshold. Of course, the filter could have been applied directly in the wordcount reducer: the job has been artificially split into two runs to give a ``SequenceFile`` read / write example. Suppose you know in advance that most counts will be large, but not so large that they cannot fit in a 32-bit integer: since the decimal representation could require as much as 10 bytes, you decide to save space by having the wordcount reducer emit the raw four bytes of the integer instead: .. literalinclude:: ../../examples/sequence_file/bin/wordcount.py :language: python :pyobject: WordCountReducer Since newline characters can appear in the serialized values, you cannot use the standard text format where each line contains a tab-separated key-value pair. The problem can be solved by using ``SequenceFileOutputFormat`` for wordcount and ``SequenceFileInputFormat`` for the filtering application. The full source code for the example is available under ``examples/sequence_file``\ . ================================================ FILE: docs/how_to_cite.rst ================================================ How to Cite =========== Pydoop is developed and maintained by researchers at `CRS4 `_ -- Distributed Computing group.
If you use Pydoop as part of your research work, please cite `the HPDC 2010 paper `_. **Plain text**:: S. Leo and G. Zanetti. Pydoop: a Python MapReduce and HDFS API for Hadoop. In Proceedings of the 19th ACM International Symposium on High Performance Distributed Computing, 819-825, 2010. **BibTeX**:: @inproceedings{Leo:2010:PPM:1851476.1851594, author = {Leo, Simone and Zanetti, Gianluigi}, title = {{Pydoop: a Python MapReduce and HDFS API for Hadoop}}, booktitle = {{Proceedings of the 19th ACM International Symposium on High Performance Distributed Computing}}, series = {HPDC '10}, year = {2010}, isbn = {978-1-60558-942-8}, location = {Chicago, Illinois}, pages = {819--825}, numpages = {7}, url = {http://doi.acm.org/10.1145/1851476.1851594}, doi = {10.1145/1851476.1851594}, acmid = {1851594}, publisher = {ACM}, address = {New York, NY, USA}, } ================================================ FILE: docs/index.rst ================================================ .. Pydoop documentation master file, created by sphinx-quickstart on Sun Jun 20 17:06:55 2010. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. **Pydoop** is a Python interface to `Hadoop `_ that allows you to write MapReduce applications in pure Python: .. literalinclude:: ../examples/pydoop_submit/mr/wordcount_minimal.py :language: python :pyobject: Mapper .. literalinclude:: ../examples/pydoop_submit/mr/wordcount_minimal.py :language: python :pyobject: Reducer Feature highlights: * a rich :ref:`HDFS API `; * a :ref:`MapReduce API ` that allows to write pure Python record readers / writers, partitioners and combiners; * transparent :ref:`Avro (de)serialization `. Pydoop enables MapReduce programming via a pure (except for a performance-critical serialization section) Python client for Hadoop Pipes, and HDFS access through an extension module based on `libhdfs `_. To get started, read the :ref:`tutorial `. Full docs, including :ref:`installation instructions `, are listed below. Contents ======== .. toctree:: :maxdepth: 2 news/index tutorial/index installation pydoop_script running_pydoop_applications api_docs/index examples/index self_contained how_to_cite Indices and Tables ================== * :ref:`genindex` * :ref:`modindex` * :ref:`search` ================================================ FILE: docs/installation.rst ================================================ .. _installation: Installation ============ Prerequisites ------------- We regularly test Pydoop on Ubuntu only, but it should also work on other Linux distros and (possibly with some tweaking) on macOS. Other platforms are **not** supported. Additional requirements: * `Python `_ 2 or 3, including header files (e.g., ``apt-get install python-dev``, ``yum install python-devel``); * `setuptools `_ >= 3.3; * Hadoop >=2. We run regular CI tests with recent versions of `Apache Hadoop `_ 2.x and 3.x, but we expect Pydoop to also work with other Hadoop distributions. In particular, we have tested it on `Amazon EMR `_ (see :ref:`emr`). These are both build time and run time requirements. At build time you will also need a C++ compiler (e.g., ``apt-get install build-essential``, ``yum install gcc gcc-c++``) and a JDK (a JRE is not sufficient). **Optional:** * `Avro `_ Python implementation to enable :ref:`avro_io` (run time only). Note that the pip packages for Python 2 and 3 are named differently (respectively ``avro`` and ``avro-python3``). 
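A quick, hypothetical way to check that the package is visible to the interpreter you plan to use (not part of the official test suite; both pip packages install the same ``avro`` module):

.. code-block:: python

   import sys
   try:
       import avro  # "avro" on Python 2, "avro-python3" on Python 3
       print("avro found on Python %d.%d" % sys.version_info[:2])
   except ImportError:
       print("Avro Python implementation not installed")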
Environment Setup ----------------- To compile the HDFS extension module, Pydoop needs the path to the JDK installation. You can specify this via ``JAVA_HOME``. For instance:: export JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64" Note that Pydoop is interested in the **JDK** home (where ``include/jni.h`` can be found), not the JRE home. Depending on your Java distribution and version, these can be different directories (usually the former being the latter's parent). If ``JAVA_HOME`` is not found in the environment, Pydoop will try to locate the JDK via Java system properties. Pydoop also includes some Java components, and it needs Hadoop libraries to be in the ``CLASSPATH`` in order to build them. This is done by calling ``hadoop classpath``, so make sure that the ``hadoop`` executable is in the ``PATH``. For instance, if Hadoop was installed by unpacking the tarball into ``/opt/hadoop``:: export PATH="/opt/hadoop/bin:/opt/hadoop/sbin:${PATH}" The Hadoop class path is also needed at run time by the HDFS extension. Again, since Pydoop picks it up from ``hadoop classpath``, ensure that ``hadoop`` is in the ``PATH``, as shown above. ``pydoop submit`` must also be able to call the ``hadoop`` executable. Additionally, Pydoop needs to read part of the Hadoop configuration to adapt to specific scenarios. If ``HADOOP_CONF_DIR`` is in the environment, Pydoop will try to read the configuration from the corresponding location. As a fallback, Pydoop will also try ``${HADOOP_HOME}/etc/hadoop`` (in the above example, ``HADOOP_HOME`` would be ``/opt/hadoop``). If ``HADOOP_HOME`` is not defined, Pydoop will try to guess it from the ``hadoop`` executable (again, this will have to be in the ``PATH``). Building and Installing ----------------------- Install prerequisites:: pip install --upgrade pip pip install --upgrade -r requirements.txt Install Pydoop via pip:: pip install pydoop To install a pre-release (e.g., alpha, beta) add ``--pre``:: pip install --pre pydoop You can also install the latest development version from GitHub:: git clone https://github.com/crs4/pydoop.git cd pydoop python setup.py build python setup.py install --skip-build If possible, you should install Pydoop on all cluster nodes. Alternatively, it can be distributed, together with your MapReduce applications, via the Hadoop distributed cache (see :doc:`self_contained`). Troubleshooting --------------- #. ``libjvm.so`` not found: try the following:: export LD_LIBRARY_PATH="${JAVA_HOME}/jre/lib/amd64/server:${LD_LIBRARY_PATH}" #. non-standard include/lib directories: the setup script looks for includes and libraries in standard places -- read ``setup.py`` for details. If some of the requirements are stored in different locations, you need to add them to the search path. Example:: python setup.py build_ext -L/my/lib/path -I/my/include/path -R/my/lib/path python setup.py build python setup.py install --skip-build Alternatively, you can write a small ``setup.cfg`` file for distutils: .. code-block:: cfg [build_ext] include_dirs=/my/include/path library_dirs=/my/lib/path rpath=%(library_dirs)s and then run ``python setup.py install``. Finally, you can achieve the same result by manipulating the environment. 
This is particularly useful in the case of automatic download and install with pip:: export CPATH="/my/include/path:${CPATH}" export LD_LIBRARY_PATH="/my/lib/path:${LD_LIBRARY_PATH}" pip install pydoop Testing your Installation ------------------------- After Pydoop has been successfully installed, you might want to run unit tests and/or examples to verify that everything works fine. Here is a short list of things that can go wrong and how to fix them. For full details on running tests and examples, see ``.travis.yml``. #. Incomplete configuration: make sure that Pydoop is able to find the ``hadoop`` executable and configuration directory (check the above section on environment setup). #. Cluster not ready: wait until all Hadoop daemons are up and HDFS exits from safe mode (``hadoop dfsadmin -safemode wait``). #. HDFS tests may fail if your NameNode's hostname and port are non-standard. In this case, set the ``HDFS_HOST`` and ``HDFS_PORT`` environment variables accordingly. #. Some HDFS tests may fail if not run by the cluster superuser, in particular ``capacity``, ``chown`` and ``used``. To get superuser privileges, you can either start the cluster with your own user account or set the ``dfs.permissions.superusergroup`` Hadoop property to one of your unix groups (type ``groups`` at the command prompt to get the list of groups for your current user), then restart the HDFS daemons. .. _emr: Using Pydoop on Amazon EMR -------------------------- You can configure your EMR cluster to automatically install Pydoop on all nodes via `Bootstrap Actions `_. The main difficulty is that Pydoop relies on Hadoop being installed and configured, even at compile time, so the bootstrap script needs to wait until EMR has finished setting it up: .. code-block:: bash #!/bin/bash PYDOOP_INSTALL_SCRIPT=$(cat </tmp/pydoop_install.out 2>/tmp/pydoop_install.err & The bootstrap script creates the actual installation script and calls it; the latter, in turn, waits for either the resource manager or the node manager to be up (i.e., for YARN to be up whether we are on the master or on a slave) before installing Pydoop. If you want to use Python 3, install version 3.6 with yum: .. code-block:: bash #!/bin/bash sudo yum -y install python36-devel python36-pip sudo alternatives --set python /usr/bin/python3.6 PYDOOP_INSTALL_SCRIPT=$(cat <`_ container. The Dockerfile is in the distribution root directory:: docker build -t pydoop . docker run --name pydoop -d pydoop This spins up a single-node, `pseudo-distributed `_ Hadoop cluster with `HDFS `_, `YARN `_ and a Job History server. Before attempting to use the container, wait a few seconds until all daemons are up and running. You may want to expose some ports to the host, such as the ones used by the web interfaces. For instance:: docker run --name pydoop -p 8088:8088 -p 9870:9870 -p 19888:19888 -d pydoop Refer to the Hadoop docs for a complete list of ports used by the various services. ================================================ FILE: docs/news/archive.rst ================================================ News Archive ------------ New in 1.2.0 ^^^^^^^^^^^^ * Added support for Hadoop 2.7.2. * Dropped support for Python 2.6. Maintaining 2.6 compatibility would require adding another dimension to the Travis matrix, vastly increasing the build time and ultimately slowing down the development. Since the default Python version in all major distributions is 2.7, the added effort would gain us little. * Bug fixes. New in 1.1.0 ^^^^^^^^^^^^ * Added support for `HDP `_ 2.2. 
* `Pyavroc `_ is now automatically loaded if installed, enabling much faster (30-40x) Avro (de)serialization. * Added Timer objects to help debug performance issues. * ``NoSeparatorTextOutputFormat`` is now available for all MR versions. * Added Avro support to the Hadoop Simulator. * Bug fixes and performance improvements. New in 1.0.0 ^^^^^^^^^^^^ * Pydoop now features a brand new, more pythonic :ref:`MapReduce API ` * Added built-in `Avro `_ support (for now, only with Hadoop 2). By setting a few flags in the submitter and selecting ``AvroContext`` as your application's context class, you can read and write Avro data, transparently manipulating records as Python dictionaries. See the :ref:`avro_io` docs for further details. * The new :ref:`pydoop submit ` tool drastically simplifies job submission, in particular when running applications without installing Pydoop and other dependencies on the cluster nodes (see :ref:`self_contained`). * Added support for testing Pydoop programs in a simulated Hadoop framework * Added support (experimental) for MapReduce V2 input/output formats (see :ref:`input_format_example`) * The :mod:`~pydoop.hdfs.path` module offers many new functions that serve as the HDFS-aware counterparts of those in :mod:`os.path` * The pipes backend (except for the performance-critical serialization section) has been reimplemented in pure Python * An alternative (optional) JPype HDFS backend is available (currently slower than the one based on libhdfs) * Added support for CDH5 and Apache Hadoop 2.4.1, 2.5.2 and 2.6.0 * Removed support for CDH3 and Apache Hadoop 0.20.2 * Installation has been greatly simplified: now Pydoop does not require any external library to build its native extensions New in 0.12.0 ^^^^^^^^^^^^^ * YARN is now fully supported * Added support for CDH 4.4.0 and CDH 4.5.0 New in 0.11.1 ^^^^^^^^^^^^^ * Added support for hadoop 2.2.0 * Added support for hadoop 1.2.1 New in 0.10.0 ^^^^^^^^^^^^^ * Added support for CDH 4.3.0 * Added a :meth:`~pydoop.hdfs.fs.hdfs.walk` method to hdfs instances (works similarly to :func:`os.walk` from Python's standard library) * The Hadoop version parser is now more flexible. 
It should be able to parse version strings for all CDH releases, including
older ones (note that most of them are **not** supported).

* Pydoop script can now handle modules whose file name has no extension
* Fixed "unable to load native-hadoop library" problem (thanks to Liam
  Slusser)

New in 0.9.0
^^^^^^^^^^^^

* Added explicit support for:

  * Apache Hadoop 1.1.2
  * CDH 4.2.0

* Added support for Cloudera from-parcels layout (as installed by Cloudera
  Manager)
* Added :func:`pydoop.hdfs.move`
* Record writers can now be used in map-only jobs

New in 0.8.1
^^^^^^^^^^^^

* Fixed a problem that was breaking installation from PyPI via pip install

New in 0.8.0
^^^^^^^^^^^^

* Added support for Apple OS X Mountain Lion
* Added support for Hadoop 1.1.1
* Patches now include a fix for `HDFS-829 `_
* Restructured docs
* A separate tutorial section collects and expands introductory material

New in 0.7.0
^^^^^^^^^^^^

* Added Debian package

New in 0.7.0-rc3
^^^^^^^^^^^^^^^^

* Fixed a bug in the hdfs instance caching method

New in 0.7.0-rc2
^^^^^^^^^^^^^^^^

* Support for HDFS append open mode (fails if your Hadoop version and/or
  configuration does not support HDFS append)

New in 0.7.0-rc1
^^^^^^^^^^^^^^^^

* Works with CDH4, with the following limitations:

  * support for MapReduce v1 only
  * CDH4 must be installed from dist-specific packages (no tarball)

* Tested with the latest releases of other Hadoop versions:

  * Apache Hadoop 0.20.2, 1.0.4
  * CDH 3u5, 4.1.2

* Simpler build process: the source code we need is now included, rather
  than searched for at compile time
* Pydoop scripts can now accept user-defined configuration parameters
* New examples show how to use the new feature
* New wrapper object makes it easier to interact with the JobConf
* New hdfs.path functions: isdir, isfile, kind
* HDFS: support for string description of permission modes in chmod
* Several bug fixes

New in 0.6.6
^^^^^^^^^^^^

Fixed a bug that was causing the pipes runner to incorrectly preprocess
command line options.

New in 0.6.4
^^^^^^^^^^^^

Fixed several bugs triggered by using a local fs as the default fs for
Hadoop. This happens when you set a ``file:`` path as the value of
``fs.defaultFS`` in core-site.xml. For instance:

.. code-block:: xml

   <property>
     <name>fs.defaultFS</name>
     <value>file:///var/hadoop/data</value>
   </property>

New in 0.6.0
^^^^^^^^^^^^

* The HDFS API features new high-level tools for easier manipulation of
  files and directories. See the :ref:`API docs ` for more info
* Examples have been thoroughly revised in order to make them easier to
  understand and run
* Several bugs were fixed; we also introduced a few optimizations, most
  notably the automatic caching of HDFS instances

New in 0.5.0
^^^^^^^^^^^^

* Pydoop now works with Hadoop 1.0
* Multiple versions of Hadoop can now be supported by the same installation
  of Pydoop
* We have added a :ref:`command line tool ` to make it trivially simple to
  write short scripts for simple problems
* In order to work out-of-the-box, Pydoop now requires Python 2.7. Python
  2.6 can be used provided that you install a few additional modules (see
  the :ref:`installation ` page for details)
* We have dropped support for the 0.21 branch of Hadoop, which has been
  marked as unstable and unsupported by Hadoop developers

================================================
FILE: docs/news/index.rst
================================================

.. _news:

News
====
.. toctree::
   :maxdepth: 1

   latest
   archive

================================================
FILE: docs/news/latest.rst
================================================

New in 2.0.0
------------

Pydoop 2.0.0 adds Python 3 and Hadoop 3 support, and features a complete
overhaul of the ``mapreduce`` subpackage, which is now easier to use and
more efficient. As with any major software release, Pydoop 2 also makes
some backwards-incompatible changes, mainly by dropping old, seldom-used
features. Finally, it includes several bug fixes and performance
improvements. Here is a more detailed list of changes:

* Python 3 support.
* Hadoop 3 support.
* The ``sercore`` extension, together with most of the ``pydoop.mapreduce``
  subpackage, has been rewritten from scratch. Now it's simpler and
  slightly faster (much faster when using a combiner).
* ``JobConf`` is now fully compatible with ``dict``.
* ``pydoop submit`` now works when the default file system is local.
* Compilation of avro-parquet-based examples is now much faster.
* Many utilities for guessing Hadoop environment details have been either
  removed or drastically simplified (affects ``hadoop_utils`` and related
  package-level functions). Pydoop now assumes that the ``hadoop`` command
  is in the ``PATH``, and uses only that information to try fallback values
  when ``HADOOP_HOME`` and/or ``HADOOP_CONF_DIR`` are not defined.
* The ``hadut`` module has been stripped down to contain little more than
  what's required by ``pydoop submit``. In particular, ``PipesRunner`` is
  gone. Running applications with ``mapred pipes`` still works, but with
  caveats (e.g., `it does not work on the local fs `_, and controlling the
  remote task environment is not trivial).
* The ``hdfs`` module no longer provides a default value for
  ``LIBHDFS_OPTS``.
* The Hadoop simulator has been dropped.
* `Support for opaque binary input splits `_.
* `Dropped support for Hadoop 1 `_.
* `Dropped old MapReduce API `_.
* `Dropped JPype HDFS backend `_.
* Bug fixes and performance improvements.

================================================
FILE: docs/pydoop_script.rst
================================================

.. _pydoop_script_guide:

Pydoop Script User Guide
========================

Pydoop Script is the easiest way to write simple MapReduce programs for
Hadoop. With Pydoop Script, you only need to write a map and/or a reduce
function and the system will take care of the rest. For a full explanation,
please see the :ref:`tutorial `.

Command Line Tool
-----------------

In the simplest case, Pydoop Script is invoked as::

  pydoop script MODULE INPUT OUTPUT

where ``MODULE`` is the file (on your local file system) containing your
map and reduce functions, in Python, while ``INPUT`` and ``OUTPUT`` are,
respectively, the HDFS paths of your input data and your job's output
directory. Options are shown in the following table.

.. include:: pydoop_script_options.rst

Example: Word Count with Stop Words
+++++++++++++++++++++++++++++++++++

Here is the word count example modified to ignore stop words from a file
that is distributed to all the nodes via the Hadoop distributed cache:

.. literalinclude:: ../examples/pydoop_script/scripts/wordcount_sw.py
   :language: python
   :start-after: DOCS_INCLUDE_START

To execute the above script, save it to a ``wc.py`` file and run::

  pydoop script wc.py hdfs_input hdfs_output --upload-file-to-cache stop_words.txt

where ``stop_words.txt`` is a text file that contains the stop words, one
per line.
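Since the actual module is pulled in from the examples tree, here is a
minimal sketch of the same idea (assumptions: the cached file shows up as
``stop_words.txt`` in each task's working directory, and reduce values
arrive as strings, as with the default text formats):

.. code-block:: python

   # Sketch of a stop-word-aware word count for pydoop script. The stop
   # word list is loaded at import time, i.e., in every task.
   with open("stop_words.txt") as f:
       STOP_WORDS = frozenset(line.strip() for line in f if line.strip())

   def mapper(key, value, writer):
       for word in value.split():
           if word.lower() in STOP_WORDS:
               writer.count("stop words", 1)  # counter created dynamically
           else:
               writer.emit(word, 1)

   def reducer(key, ivalue, writer):
       writer.emit(key, sum(map(int, ivalue)))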
While this script works, it has the obvious weakness of loading the stop
words list even when executing the reducer (since it's loaded as soon as we
import the module). If this inconvenience is a concern, we could solve the
issue by triggering the loading from the ``mapper`` function, or by writing
a :ref:`full Pydoop application ` which would give us all the control we
need to only load the list when required.

Writing your Map and Reduce Functions
-------------------------------------

In this section we assume you'll be using the default ``TextInputFormat``
and ``TextOutputFormat``.

Mapper
++++++

The ``mapper`` function in your module will be called for each record in
your input data. It receives three required parameters, plus an optional
fourth:

#. key: the byte offset with respect to the current input file. In most
   cases, you can ignore it;
#. value: the line of text to be processed;
#. writer object: a Python object to write output and count values (see
   below);
#. optionally, a job conf object from which to fetch configuration property
   values (see `Accessing Parameters`_ below).

Combiner
++++++++

The ``combiner`` function will be called for each unique key produced by
your map function. Like the mapper, it receives three required parameters,
plus an optional fourth:

#. key: the key produced by your map function;
#. values iterable: iterate over this parameter to see all the values
   emitted for the current key;
#. writer object: a writer object identical to the one given to the map
   function;
#. optionally, a job conf object, identical to the one given to the map
   function.

The key-value pairs emitted by your combiner will be piped to the reducer.

Reducer
+++++++

The ``reducer`` function will be called for each unique key in your map (or
combiner) output. Like the mapper, it receives three required parameters,
plus an optional fourth:

#. key: the key produced by your map function;
#. values iterable: iterate over this parameter to traverse all the values
   emitted for the current key;
#. writer object: this is identical to the one given to the map function;
#. optionally, a job conf object, identical to the one given to the map
   function.

In each key-value pair emitted by your reducer, the key and the value will
be joined by the key-value separator specified with the ``--kv-separator``
option (a tab character by default).

Writer Object
+++++++++++++

The writer object given as the third parameter to both the ``mapper`` and
``reducer`` functions has the following methods:

* ``emit(k, v)``: pass a ``(k, v)`` key-value pair to the framework;
* ``count(what, how_many)``: add ``how_many`` to the counter named
  ``what``. If the counter doesn't already exist, it will be created
  dynamically;
* ``status(msg)``: update the task status to ``msg``;
* ``progress()``: mark your task as having made progress without changing
  the status message.

The latter two methods are useful for keeping your task alive in cases
where the amount of computation to be done for a single record might exceed
Hadoop's timeout interval (Hadoop kills a task if it neither reads an
input, writes an output, nor updates its status for a configurable amount
of time, set to 10 minutes by default).

Accessing Parameters
++++++++++++++++++++

Pydoop Script lets you access the values of your job configuration
properties through a dict-like :class:`~pydoop.mapreduce.api.JobConf`
object, which gets passed as the fourth (optional) parameter to your
functions.

Naming your Functions
+++++++++++++++++++++

If you'd like to give your map and reduce functions names different from
``mapper`` and ``reducer``, you may do so, but you must tell the script
tool.
Use the ``--map-fn`` and ``--reduce-fn`` command line arguments to select your customized names. Combiner functions can only be assigned by explicitly setting the ``--combine-fn`` flag. Map-only Jobs +++++++++++++ You may have a program that doesn't use a reduce function. Specify ``--num-reducers 0`` on the command line and your map output will be written directly to file. In this case, your map output will go directly to the output formatter and be written to your final output, separated by the key-value separator. ================================================ FILE: docs/pydoop_script_options.rst ================================================ .. Auto-generated by dev_tools/dump_app_params. DO NOT EDIT! To update, run: dev_tools/dump_app_params --app script -o docs/pydoop_script_options.rst +--------+-------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | Short | Long | Meaning | +========+===============================+==========================================================================================================================================================+ | | ``--num-reducers`` | Number of reduce tasks. Specify 0 to only perform map phase | +--------+-------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--no-override-home`` | Don't set the script's HOME directory to the $HOME in your environment. Hadoop will set it to the value of the 'mapreduce.admin.user.home.dir' property | +--------+-------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--no-override-env`` | Use the default PATH, LD_LIBRARY_PATH and PYTHONPATH, instead of copying them from the submitting client node | +--------+-------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--no-override-ld-path`` | Use the default LD_LIBRARY_PATH instead of copying it from the submitting client node | +--------+-------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--no-override-pypath`` | Use the default PYTHONPATH instead of copying it from the submitting client node | +--------+-------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--no-override-path`` | Use the default PATH instead of copying it from the submitting client node | +--------+-------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--set-env`` | Set environment variables for the tasks. If a variable is set to '', it will not be overridden by Pydoop. 
| +--------+-------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | ``-D`` | ``--job-conf`` | Set a Hadoop property, e.g., -D mapreduce.job.priority=high | +--------+-------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--python-zip`` | Additional python zip file | +--------+-------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--upload-file-to-cache`` | Upload and add this file to the distributed cache. | +--------+-------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--upload-archive-to-cache`` | Upload and add this archive file to the distributed cache. | +--------+-------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--log-level`` | Logging level | +--------+-------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--job-name`` | name of the job | +--------+-------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--python-program`` | python executable that should be used by the wrapper | +--------+-------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--pretend`` | Do not actually submit a job, print the generated config settings and the command line that would be invoked | +--------+-------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--hadoop-conf`` | Hadoop configuration file | +--------+-------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--input-format`` | java classname of InputFormat | +--------+-------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | ``-m`` | ``--map-fn`` | name of map function within module | +--------+-------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | ``-r`` | ``--reduce-fn`` | name of reduce function within module | +--------+-------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | ``-c`` | ``--combine-fn`` | name of combine function within module | 
+--------+-------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--combiner-fn`` | --combine-fn alias for backwards compatibility | +--------+-------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | ``-t`` | ``--kv-separator`` | output key-value separator | +--------+-------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ ================================================ FILE: docs/pydoop_submit_options.rst ================================================ .. Auto-generated by dev_tools/dump_app_params. DO NOT EDIT! To update, run: dev_tools/dump_app_params --app submit -o docs/pydoop_submit_options.rst +--------+----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | Short | Long | Meaning | +========+========================================+==========================================================================================================================================================+ | | ``--num-reducers`` | Number of reduce tasks. Specify 0 to only perform map phase | +--------+----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--no-override-home`` | Don't set the script's HOME directory to the $HOME in your environment. Hadoop will set it to the value of the 'mapreduce.admin.user.home.dir' property | +--------+----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--no-override-env`` | Use the default PATH, LD_LIBRARY_PATH and PYTHONPATH, instead of copying them from the submitting client node | +--------+----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--no-override-ld-path`` | Use the default LD_LIBRARY_PATH instead of copying it from the submitting client node | +--------+----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--no-override-pypath`` | Use the default PYTHONPATH instead of copying it from the submitting client node | +--------+----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--no-override-path`` | Use the default PATH instead of copying it from the submitting client node | +--------+----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--set-env`` | Set environment variables for the tasks. 
If a variable is set to '', it will not be overridden by Pydoop. | +--------+----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | ``-D`` | ``--job-conf`` | Set a Hadoop property, e.g., -D mapreduce.job.priority=high | +--------+----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--python-zip`` | Additional python zip file | +--------+----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--upload-file-to-cache`` | Upload and add this file to the distributed cache. | +--------+----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--upload-archive-to-cache`` | Upload and add this archive file to the distributed cache. | +--------+----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--log-level`` | Logging level | +--------+----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--job-name`` | name of the job | +--------+----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--python-program`` | python executable that should be used by the wrapper | +--------+----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--pretend`` | Do not actually submit a job, print the generated config settings and the command line that would be invoked | +--------+----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--hadoop-conf`` | Hadoop configuration file | +--------+----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--input-format`` | java classname of InputFormat | +--------+----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--disable-property-name-conversion`` | Do not adapt property names to the hadoop version used. 
| +--------+----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--do-not-use-java-record-reader`` | Disable java RecordReader | +--------+----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--do-not-use-java-record-writer`` | Disable java RecordWriter | +--------+----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--output-format`` | java classname of OutputFormat | +--------+----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--libjars`` | Additional comma-separated list of jar files | +--------+----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--cache-file`` | Add this HDFS file to the distributed cache as a file. | +--------+----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--cache-archive`` | Add this HDFS archive file to the distributed cache as an archive. | +--------+----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--entry-point`` | Explicitly execute MODULE.ENTRY_POINT() in the launcher script.
| +--------+----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--avro-input`` | Avro input mode (key, value or both) | +--------+----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--avro-output`` | Avro output mode (key, value or both) | +--------+----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--pstats-dir`` | Profile each task and store stats in this dir | +--------+----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--pstats-fmt`` | pstats filename pattern (expert use only) | +--------+----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--keep-wd`` | Don't remove the work dir | +--------+----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+

================================================
FILE: docs/running_pydoop_applications.rst
================================================

.. _running_apps:

Pydoop Submit User Guide
========================

Pydoop applications are run via the ``pydoop submit`` command. To start,
you will need a working Hadoop cluster. If you don't have one available,
you can bring up a single-node Hadoop cluster on your machine -- see `the
Hadoop web site `_ for instructions. Alternatively, the source directory
contains a Dockerfile that can be used to build an image with Hadoop and
Pydoop installed and (minimally) configured. Check out ``.travis.yml`` for
usage hints.

If your application is contained in a single (local) file named ``wc.py``,
with an entry point called ``__main__`` (see :ref:`api_tutorial`), you can
run it as follows::

  pydoop submit --upload-file-to-cache wc.py wc input output

where ``input`` (file or directory) and ``output`` (directory) are HDFS
paths. Note that the ``output`` directory will not be overwritten: instead,
an error will be generated if it already exists when you launch the
program. If your entry point has a different name, specify it via
``--entry-point``.

The following table shows command line options for ``pydoop submit``:

.. include:: pydoop_submit_options.rst

Setting the Environment for your Program
----------------------------------------

When working on a shared cluster where you don't have root access, you
might have a lot of software installed in non-standard locations, such as
your home directory. Since non-interactive ssh connections do not usually
preserve your environment, you might lose some essential settings such as
``LD_LIBRARY_PATH``\ . For this reason, by default ``pydoop submit`` copies
some environment variables from the submitting node to the driver script
that runs each task on Hadoop. If this behavior is not desired, you can
disable it via the ``--no-override-env`` command line option.
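For instance, assuming ``--set-env`` takes ``NAME=VALUE`` pairs, a
submission that copies the environment as usual but pins one variable
explicitly and sets an extra Hadoop property could look like this (all
values are illustrative)::

  pydoop submit \
      --upload-file-to-cache wc.py \
      --num-reducers 4 \
      --set-env LD_LIBRARY_PATH=/opt/mylibs \
      -D mapreduce.job.name=wordcount \
      wc input output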
================================================ FILE: docs/self_contained.rst ================================================ .. _self_contained: Installation-free Usage ======================= This example shows how to use the Hadoop Distributed Cache (DC) to distribute Python packages, possibly including Pydoop itself, to all cluster nodes at job launch time. This is useful in all cases where installing to each node is not feasible (e.g., lack of a shared mount point). Of course, Hadoop itself must be already installed and properly configured in all cluster nodes before you can run this. Source code for this example is available under ``examples/self_contained``\ . Example Application: Count Vowels --------------------------------- The example MapReduce application, ``vowelcount``, is rather trivial: it counts the occurrence of each vowel in the input text. Since the point here is to show how a structured package can be distributed and imported, the implementation is exceedingly verbose. .. literalinclude:: ../examples/self_contained/vowelcount/lib/__init__.py :language: python :start-after: DOCS_INCLUDE_START .. literalinclude:: ../examples/self_contained/vowelcount/mr/mapper.py :language: python :pyobject: Mapper .. literalinclude:: ../examples/self_contained/vowelcount/mr/reducer.py :language: python :pyobject: Reducer How it Works ------------ The DC supports automatic distribution of files and archives across the cluster at job launch time. This feature can be used to dispatch Python packages to all nodes, eliminating the need to install dependencies for your application, including Pydoop itself:: pydoop submit --upload-archive-to-cache vowelcount.tgz \ --upload-archive-to-cache pydoop.tgz [...] The ``pydoop.tgz`` and ``vowelcount.tgz`` archives will be copied to all slave nodes and unpacked; in addition, ``pydoop`` and ``vowelcount`` symlinks will be created in the current working directory of each task before it is executed. If you include in each archive the *contents* of the corresponding package, they will be available for import:: cd examples/self_contained/vowelcount tar cfz ../vowelcount.tgz . The archive must be in one of the formats supported by Hadoop: zip, tar or tgz. .. note:: Pydoop submit automatically builds the name of the symlink that points to the unpacked archive by stripping the last extension. Thus, ``foo.tar.gz`` will not work as expected, since the link will be called ``foo.tar``. Always use the ``.tgz`` extension in this case. The example is supposed to work with Pydoop and vowelcount *not* installed on the slave nodes (you do need Pydoop on the client machine used to run the example, however). ================================================ FILE: docs/tutorial/hdfs_api.rst ================================================ .. _hdfs_api_tutorial: The HDFS API ============ The :ref:`HDFS API ` allows you to connect to an HDFS installation, read and write files and get information on files, directories and global file system properties: .. literalinclude:: ../../examples/hdfs/repl_session.py :language: python :start-after: DOCS_INCLUDE_START :end-before: DOCS_INCLUDE_END Low-level API ------------- The high level API showcased above can be inefficient when performing multiple operations on the same HDFS instance. This is due to the fact that, under the hood, each function opens a separate connection to the HDFS server and closes it before returning. 
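As a rough sketch of the difference (treat the method names and the
structure of the returned values as indicative; see the API reference for
the authoritative signatures):

.. code-block:: python

   import pydoop.hdfs as hdfs

   # High-level functions: each call opens and closes its own connection.
   for path in hdfs.ls("data"):
       text = hdfs.load(path)

   # Doing the same work over a single, explicitly managed connection.
   fs = hdfs.hdfs()  # connect to the default HDFS instance
   try:
       for info in fs.list_directory("data"):
           f = fs.open_file(info["name"])
           try:
               text = f.read()
           finally:
               f.close()
   finally:
       fs.close()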
The following example shows how to build statistics of HDFS usage by block
size by directly instantiating an ``hdfs`` object, which represents an open
connection to an HDFS instance. Full source code for the example, including
a script that can be used to generate an HDFS directory tree, is located
under ``examples/hdfs`` in the Pydoop distribution.

.. literalinclude:: ../../examples/hdfs/treewalk.py
   :language: python
   :start-after: DOCS_INCLUDE_START

For more information, see the :ref:`HDFS API reference `.

================================================
FILE: docs/tutorial/index.rst
================================================

.. _tutorial:

Tutorial
========

.. toctree::
   :maxdepth: 2

   pydoop_script
   hdfs_api
   mapred_api

================================================
FILE: docs/tutorial/mapred_api.rst
================================================

.. _api_tutorial:

Writing Full-Featured Applications
==================================

While :ref:`Pydoop Script ` allows you to solve many problems with minimal
programming effort, some tasks require a broader set of features. If your
data is not simple text with one record per line, for instance, you may
need to write a record reader; if you need to change the way intermediate
keys are assigned to reducers, you have to write your own partitioner.
These components are accessible via the Pydoop MapReduce API. The rest of
this section serves as an introduction to MapReduce programming with
Pydoop; the :ref:`API reference ` has all the details.

Mappers and Reducers
--------------------

The Pydoop API is object-oriented: the application developer writes a
:class:`~pydoop.mapreduce.api.Mapper` class, whose core job is performed by
the :meth:`~pydoop.mapreduce.api.Mapper.map` method, and a
:class:`~pydoop.mapreduce.api.Reducer` class that processes data via the
:meth:`~pydoop.mapreduce.api.Reducer.reduce` method. The following snippet
shows how to write the mapper and reducer for *wordcount*, an application
that counts the occurrence of each word in a text data set:

.. literalinclude:: ../../examples/pydoop_submit/mr/wordcount_minimal.py
   :language: python
   :start-after: DOCS_INCLUDE_START

The mapper is instantiated by the MapReduce framework, which, for each
input record, calls the ``map`` method, passing a ``context`` object to it.
The context serves as a communication interface between the framework and
the application: in the ``map`` method, it is used to get the current key
(not used in the above example) and value, and to emit (send back to the
framework) intermediate key-value pairs. The reducer works in a similar
way, the main difference being the fact that the ``reduce`` method gets a
set of values for each key. The context has several other functions that we
will explore later.

To run the above program, save it to a ``wc.py`` file and execute::

  pydoop submit --upload-file-to-cache wc.py wc input output

where ``input`` is the HDFS input directory. See the section on
:ref:`running Pydoop programs` for more details. Source code for the word
count example is located under ``examples/pydoop_submit/mr`` in the Pydoop
distribution.
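Since the wordcount snippet above is pulled in from the distribution, here
is a minimal sketch of its general shape, entry point included (the shipped
example may differ in details):

.. code-block:: python

   import pydoop.mapreduce.api as api
   import pydoop.mapreduce.pipes as pp

   class Mapper(api.Mapper):

       def map(self, context):
           # context.value holds the current input record (a text line)
           for word in context.value.split():
               context.emit(word, 1)

   class Reducer(api.Reducer):

       def reduce(self, context):
           # context.values iterates over all values for context.key
           context.emit(context.key, sum(context.values))

   def __main__():
       pp.run_task(pp.Factory(Mapper, reducer_class=Reducer))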
Counters and Status Updates
---------------------------

Hadoop features application-wide counters that can be set and incremented
by developers. Status updates are arbitrary text messages sent to the
framework: these are especially useful in cases where the computation
associated with a single input record can take a considerable amount of
time, since Hadoop kills tasks that read no input, write no output and do
not update the status within a configurable amount of time (ten minutes by
default). The following snippet shows how to modify the above example to
use counters and status updates:

.. literalinclude:: ../../examples/pydoop_submit/mr/wordcount_full.py
   :language: python
   :pyobject: Mapper

.. literalinclude:: ../../examples/pydoop_submit/mr/wordcount_full.py
   :language: python
   :pyobject: Reducer

Counter values and status updates show up in Hadoop's web interface. In
addition, the final values of all counters are listed in the command line
output of the job (note that the list also includes Hadoop's default
counters).

Record Readers and Writers
--------------------------

By default, Hadoop assumes you want to process plain text and splits input
data into text lines. If you need to process binary data, or your text data
is structured into records that span multiple lines, you need to write your
own :class:`~pydoop.mapreduce.api.RecordReader`. The **record reader**
operates at the HDFS file level: its job is to read data from the file and
feed it as a stream of key-value pairs (records) to the mapper. To interact
with HDFS files, we need to import the ``hdfs`` submodule:

.. code-block:: python

   import pydoop.hdfs as hdfs

The following example shows how to write a record reader that mimics
Hadoop's default ``LineRecordReader``, where keys are byte offsets with
respect to the whole file and values are text lines:

.. literalinclude:: ../../examples/pydoop_submit/mr/wordcount_full.py
   :language: python
   :pyobject: Reader

From the context, the record reader gets the following information on the
byte chunk assigned to the current task, or **input split**:

* the name of the file it belongs to;
* its offset with respect to the beginning of the file;
* its length.

This allows the reader to open the file, seek to the correct offset and
read until the end of the split is reached. The framework gets the record
stream by means of repeated calls to the
:meth:`~pydoop.mapreduce.api.RecordReader.next` method. The
:meth:`~pydoop.mapreduce.api.RecordReader.get_progress` method is called by
the framework to get the fraction of the input split that's already been
processed. The ``close`` method (present in all components except for the
partitioner) is called by the framework once it has finished retrieving the
records: this is the right place to perform cleanup tasks such as closing
open handles. To use the reader, pass the class object to the factory with
``record_reader_class=Reader`` and, when running the program with ``pydoop
submit``, set the ``--do-not-use-java-record-reader`` flag.
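A sketch along those lines (the bundled ``Reader`` is pulled in above;
attribute names follow the description of the input split, and details such
as text decoding are simplified):

.. code-block:: python

   import pydoop.hdfs as hdfs
   import pydoop.mapreduce.api as api

   class Reader(api.RecordReader):

       def __init__(self, context):
           super(Reader, self).__init__(context)
           split = context.input_split  # has filename, offset and length
           self.offset, self.length = split.offset, split.length
           self.file = hdfs.open(split.filename)
           self.file.seek(self.offset)
           self.bytes_read = 0
           if self.offset > 0:
               # the first partial line belongs to the previous split
               self.bytes_read += len(self.file.readline())

       def close(self):
           self.file.close()

       def next(self):
           if self.bytes_read > self.length:  # past the end of the split
               raise StopIteration
           key = self.offset + self.bytes_read
           record = self.file.readline()
           if not record:  # end of file
               raise StopIteration
           self.bytes_read += len(record)
           return key, record

       def get_progress(self):
           return min(self.bytes_read / float(self.length), 1.0)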
The **record writer** writes key/value pairs to output files. The default
behavior is to write one tab-separated key/value pair per line; if you want
to do something different, you have to write a custom
:class:`~pydoop.mapreduce.api.RecordWriter`:

.. literalinclude:: ../../examples/pydoop_submit/mr/wordcount_full.py
   :language: python
   :pyobject: Writer

The above example, which simply reproduces the default behavior, also shows
how to get job configuration parameters: the one starting with
``mapreduce`` is a standard Hadoop parameter, while ``pydoop.hdfs.user`` is
a custom parameter defined by the application developer. Configuration
properties are passed as ``-D <property>=<value>`` (e.g., ``-D
mapreduce.output.textoutputformat.separator='|'``) to the submitter. To use
the writer, pass the class object to the factory with
``record_writer_class=Writer`` and, when running the program with ``pydoop
submit``, set the ``--do-not-use-java-record-writer`` flag.

Partitioners and Combiners
--------------------------

The :class:`~pydoop.mapreduce.api.Partitioner` assigns intermediate keys to
reducers. If you do *not* explicitly set a partitioner via the factory,
partitioning will be done on the Java side. By default, Hadoop uses
`HashPartitioner `_, which selects the reducer on the basis of a hash
function of the key. To write a custom partitioner in Python, subclass
:class:`~pydoop.mapreduce.api.Partitioner`, overriding the
:meth:`~pydoop.mapreduce.api.Partitioner.partition` method. The framework
will call this method with the current key and the total number of reducers
``N`` as the arguments, and expect the chosen reducer ID --- in the
``[0, ..., N-1]`` range --- as the return value. The following example
shows how to write a partitioner that simply mimics the default
``HashPartitioner`` behavior:

.. literalinclude:: ../../examples/pydoop_submit/mr/wordcount_full.py
   :language: python
   :pyobject: Partitioner
   :prepend: from hashlib import md5
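For reference, a minimal partitioner of this kind might look like the
following sketch (assuming text or bytes keys; the included example may
differ in details):

.. code-block:: python

   from hashlib import md5

   import pydoop.mapreduce.api as api

   class Partitioner(api.Partitioner):

       def partition(self, key, n_reduces):
           # a stable hash of the key, reduced to [0, n_reduces - 1]
           k = key if isinstance(key, bytes) else key.encode("utf-8")
           return int(md5(k).hexdigest(), 16) % n_reduces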
The combiner is functionally identical to a reducer, but it is run locally,
on the key-value stream output by a single mapper. Although nothing
prevents the combiner from processing values differently from the reducer,
the former is typically configured to be the same as the latter (provided
that the reduce function is associative and commutative), in order to
perform local aggregation and thus help cut down network traffic. Local
aggregation is implemented by caching intermediate key/value pairs in a
dictionary. As in standard Java Hadoop, cache size is controlled by
``mapreduce.task.io.sort.mb`` and defaults to 100 MB. Pydoop uses
:func:`sys.getsizeof` to determine key/value size, which takes into account
Python object overhead. This can be quite substantial (e.g.,
``sys.getsizeof(b"foo") == 36``) and must be taken into account if fine
tuning is desired.

.. important::

   Due to the caching, when using a combiner there are limitations on the
   types that can be used for intermediate keys and values. First of all,
   keys must be `hashable `_. In addition, values belonging to a mutable
   type should not change after having been emitted by the mapper. For
   instance, the following (however contrived) example would not work as
   expected:

   .. code-block:: python

      intermediate_value = {}

      class Mapper(api.Mapper):

          def map(self, ctx):
              intermediate_value.clear()
              intermediate_value[ctx.key] = ctx.value
              ctx.emit("foo", intermediate_value)

   For these reasons, it is recommended to use immutable types for both
   keys and values when the job includes a combiner.

Custom partitioner and combiner classes must be declared to the factory as
done above for record readers and writers. To recap, if we need to use all
of the above components, we need to instantiate the factory as:

.. literalinclude:: ../../examples/pydoop_submit/mr/wordcount_full.py
   :language: python
   :start-after: DOCS_INCLUDE_START
   :end-before: DOCS_INCLUDE_END

Profiling Your Application
--------------------------

Python has built-in support for application `profiling `_. Profiling a
standalone program is relatively straightforward: run it through
``cProfile``, store stats in a file and use ``pstats`` to read and
interpret them. A MapReduce job, however, spawns multiple map and reduce
tasks, so we need a way to collect all stats. Pydoop supports this via a
``pstats_dir`` argument to ``run_task``:

.. code-block:: python

   pipes.run_task(factory, pstats_dir="pstats")

With the above call, Pydoop will run each MapReduce task with ``cProfile``,
and store the resulting pstats files in the ``"pstats"`` directory on HDFS.
You can also enable profiling in the ``pydoop submit`` command line:

.. code-block:: bash

   pydoop submit --pstats-dir HDFS_DIR [...]

If the pstats directory is specified both ways, the one from ``run_task``
takes precedence.
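The per-task pstats files can then be fetched and merged with the standard
library. For example, assuming ``pstats`` was used as the stats directory
as above (a sketch; the HDFS helpers are the high-level functions of
``pydoop.hdfs``):

.. code-block:: python

   import os
   import pstats
   import tempfile

   import pydoop.hdfs as hdfs

   tmp_dir = tempfile.mkdtemp()
   local_files = []
   for path in hdfs.ls("pstats"):  # one stats file per task
       dst = os.path.join(tmp_dir, os.path.basename(path))
       hdfs.get(path, dst)
       local_files.append(dst)

   # merge all task profiles and show the top time consumers
   stats = pstats.Stats(*local_files)
   stats.sort_stats("cumulative").print_stats(10)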
Another way to do time measurements is via counters. The ``utils.misc``
module provides a ``Timer`` object for this purpose:

.. code-block:: python

   from pydoop.utils.misc import Timer

   class Mapper(api.Mapper):

       def __init__(self, context):
           super(Mapper, self).__init__(context)
           self.timer = Timer(context)

       def map(self, context):
           with self.timer.time_block("tokenize"):
               words = context.value.split()
           for w in words:
               context.emit(w, 1)

With the above code, the total time spent executing
``context.value.split()`` (in ms) will be automatically accumulated in a
``TIME_TOKENIZE`` counter under the ``Timer`` counter group. Since
profiling and timers can substantially slow down the Hadoop job, they
should only be used for performance debugging.

================================================
FILE: docs/tutorial/pydoop_script.rst
================================================

.. _pydoop_script_tutorial:

Easy Hadoop Scripting with Pydoop Script
========================================

Pydoop Script is the easiest way to write simple MapReduce programs for
Hadoop. With Pydoop Script, your code focuses on the core of the MapReduce
model: the mapper and reducer functions.

Writing and Running Scripts
---------------------------

Write a ``script.py`` Python module that contains the mapper and reducer
functions:

.. code-block:: python

   def mapper(input_key, input_value, writer):
       # your computation here
       writer.emit(intermediate_key, intermediate_value)

   def reducer(intermediate_key, value_iterator, writer):
       # your computation here
       writer.emit(output_key, output_value)

The program can be run as follows::

  pydoop script script.py hdfs_input hdfs_output

Examples
--------

The following examples show how to use Pydoop Script for common problems.
More examples can be found in the ``examples/pydoop_script`` subdirectory
of Pydoop's source distribution root. The :ref:`Pydoop Script Guide `
contains more detailed information on writing and running programs.

.. _word_count:

Word Count
++++++++++

Count the occurrence of each word in a set of text files.

.. literalinclude:: ../../examples/pydoop_script/scripts/wordcount.py
   :language: python
   :start-after: DOCS_INCLUDE_START

A few more lines allow us to set a combiner for local aggregation:

.. literalinclude:: ../../examples/pydoop_script/scripts/wc_combiner.py
   :language: python
   :start-after: DOCS_INCLUDE_START

Run the example with::

  pydoop script -c combiner wordcount.py hdfs_input hdfs_output

Note that we need to explicitly set the ``-c`` flag to activate the
combiner: by default, no combiner is called. One thing to remember is that
the current Hadoop Pipes architecture runs the combiner under the hood of
the executable run by ``pipes``, so it does not update the combiner
counters of the general Hadoop framework. Thus, if you run the above
script, you'll get a value of 0 for "Combine input/output records" in the
"Map-Reduce Framework" group, but the "combiner calls" counter should be
updated correctly.

Map-only Jobs and Output Separators
+++++++++++++++++++++++++++++++++++

Suppose we want to convert all input text to lower case. All we need to do
is read each input line, convert it to lower case and emit it (for
instance, as the output value). Since there is no aggregation involved, we
don't need a reducer:

.. literalinclude:: ../../examples/pydoop_script/scripts/lowercase.py
   :language: python
   :start-after: DOCS_INCLUDE_START

The only problem with the above code is that, by default, each output
key-value pair is written as tab-separated, which would lead to each output
line having a leading tab character that's not found in the original input
(note that we'd get a *trailing* tab if we emitted each record as the
output key instead). We can turn off the reduce phase and get an empty
separator for output key-value pairs by submitting the job with the
following options::

  pydoop script --num-reducers 0 -t '' lowercase.py hdfs_input hdfs_output

Custom Parameters
+++++++++++++++++

Suppose we want to select all lines containing a substring to be given at
run time (distributed grep). As in the previous example, we can do this
with a map-only job (read each input line and emit it if it contains the
substring), but we need a way for the user of our application to specify
the substring to be matched. This can be done by adding a fourth argument
to the mapper function:

.. literalinclude:: ../../examples/pydoop_script/scripts/grep.py
   :language: python
   :start-after: DOCS_INCLUDE_START

In this case, Pydoop Script passes the Hadoop job configuration to the
``mapper`` function as a dictionary via the fourth argument. Moreover, just
like Hadoop tools (e.g., ``hadoop pipes``), Pydoop Script allows you to set
additional configuration parameters via ``-D key=value``. To search for
"hello", for instance, we can run the application as::

  pydoop script --num-reducers 0 -t '' -D grep-expression=hello \
      grep.py hdfs_input hdfs_output
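The mapper for such a distributed grep can be as small as the following
sketch (the shipped ``grep.py`` is pulled in above and may differ; the
property name matches the one set on the command line):

.. code-block:: python

   def mapper(key, value, writer, conf):
       # conf is the job configuration, passed as the optional fourth
       # argument; "grep-expression" is set via -D grep-expression=...
       if conf["grep-expression"] in value:
           writer.emit(value, "")  # with -t '' the output is just the line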
"${this_dir}/config.sh" pushd "${this_dir}" gen_classpath cp="$(<"${CP_PATH}"):$(${HADOOP} classpath)" mkdir -p "${CLASS_DIR}" javac -cp "${cp}" -d "${CLASS_DIR}" src/main/java/it/crs4/pydoop/* jar -cf "${JAR_PATH}" -C "${CLASS_DIR}" ./it popd ================================================ FILE: examples/avro/config.sh ================================================ [ -n "${PYDOOP_AVRO_EXAMPLES:-}" ] && return || readonly PYDOOP_AVRO_EXAMPLES=1 TARGET="target" export CLASS_DIR="${TARGET}/classes" export CP_PATH="${TARGET}/cp.txt" export JAR_PATH="${TARGET}/pydoop-avro-examples.jar" gen_classpath() { [ -f "${CP_PATH}" ] && return 0 mkdir -p "${TARGET}" mvn dependency:resolve mvn dependency:build-classpath -D mdep.outputFile="${CP_PATH}" echo -n ':'$(readlink -e ../../lib)/'*' >> "${CP_PATH}" } export -f gen_classpath ================================================ FILE: examples/avro/pom.xml ================================================ 4.0.0 it.crs4.pydoop pydoop-avro-examples jar 2.0a2 Pydoop Avro Examples https://crs4.github.io/pydoop/ 1.7.0 org.apache.parquet parquet-common ${parquet.version} org.apache.parquet parquet-column ${parquet.version} org.apache.parquet parquet-hadoop ${parquet.version} org.apache.parquet parquet-avro ${parquet.version} ================================================ FILE: examples/avro/py/avro_base.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT import sys import abc from collections import Counter import pydoop.mapreduce.api as api import pydoop.mapreduce.pipes as pp class ColorPickBase(api.Mapper): @abc.abstractmethod def get_user(self, ctx): """ Get the user record. This is just to avoid writing near identical examples for the various key/value cases. In a real application, carrying records over keys or values would be a design decision, so you would simply do, e.g., ``user = self.value``. """ def map(self, ctx): user = self.get_user(ctx) color = user['favorite_color'] if color is not None: ctx.emit(user['office'], Counter({color: 1})) class AvroKeyColorPick(ColorPickBase): def get_user(self, ctx): return ctx.key class AvroValueColorPick(ColorPickBase): def get_user(self, ctx): return ctx.value class AvroKeyValueColorPick(ColorPickBase): def get_user(self, ctx): return ctx.key def map(self, ctx): sys.stdout.write("value (unused): %r\n" % (ctx.value,)) super(AvroKeyValueColorPick, self).map(ctx) class ColorCountBase(api.Reducer): def reduce(self, ctx): s = sum(ctx.values, Counter()) self.emit(s, ctx) @abc.abstractmethod def emit(self, s, ctx): """ Emit the sum to the ctx. As in the base mapper, this is just to avoid writing near identical examples. 
""" class NoAvroColorCount(ColorCountBase): def emit(self, s, ctx): ctx.emit(ctx.key, "%r" % s) class AvroKeyColorCount(ColorCountBase): def emit(self, s, ctx): ctx.emit({'office': ctx.key, 'counts': s}, ctx.key) class AvroValueColorCount(ColorCountBase): def emit(self, s, ctx): ctx.emit(ctx.key, {'office': ctx.key, 'counts': s}) class AvroKeyValueColorCount(ColorCountBase): def emit(self, s, ctx): record = {'office': ctx.key, 'counts': s} ctx.emit(record, record) # FIXME: do something fancier def run_task(mapper_class, reducer_class=NoAvroColorCount): pp.run_task(pp.Factory(mapper_class, reducer_class=reducer_class)) ================================================ FILE: examples/avro/py/avro_container_dump_results.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT import sys from avro.io import DatumReader from avro.datafile import DataFileReader def main(fn, out_fn, avro_mode=''): with open(out_fn, 'w') as fo: with open(fn, 'rb') as f: reader = DataFileReader(f, DatumReader()) for r in reader: if avro_mode.upper() == 'KV': r = r['key'] fo.write('%s\t%r\n' % (r['office'], r['counts'])) print('wrote', out_fn) if __name__ == '__main__': main(*sys.argv[1:]) ================================================ FILE: examples/avro/py/avro_key_in.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT from avro_base import AvroKeyColorPick, run_task def __main__(): run_task(AvroKeyColorPick) if __name__ == '__main__': __main__() ================================================ FILE: examples/avro/py/avro_key_in_out.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. 
# # END_COPYRIGHT from avro_base import AvroKeyColorPick, AvroKeyColorCount, run_task def __main__(): run_task(AvroKeyColorPick, AvroKeyColorCount) if __name__ == '__main__': __main__() ================================================ FILE: examples/avro/py/avro_key_value_in.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT from avro_base import AvroKeyValueColorPick, run_task def __main__(): run_task(AvroKeyValueColorPick) if __name__ == '__main__': __main__() ================================================ FILE: examples/avro/py/avro_key_value_in_out.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT from avro_base import AvroKeyValueColorPick, AvroKeyValueColorCount, run_task def __main__(): run_task(AvroKeyValueColorPick, AvroKeyValueColorCount) if __name__ == '__main__': __main__() ================================================ FILE: examples/avro/py/avro_parquet_dump_results.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT import pydoop.mapreduce.api as api import pydoop.mapreduce.pipes as pp class Mapper(api.Mapper): def map(self, ctx): cc_stat = ctx.value ctx.emit(cc_stat['office'], repr(cc_stat['counts'])) def __main__(): pp.run_task(pp.Factory(Mapper)) ================================================ FILE: examples/avro/py/avro_pyrw.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. 
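# With --avro-input v, ctx.value in the avro_parquet_dump_results.py mapper
# (above) is a dict matching stats.avsc, e.g. (made-up counts):
#
#   {'office': 'office-0', 'counts': {'red': 3, 'cyan': 1}}
#
# which gets re-emitted as the text line "office-0\t{'red': 3, 'cyan': 1}".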
You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT """ Avro color count with Python record reader/writer. """ from collections import Counter import pydoop.mapreduce.api as api import pydoop.mapreduce.pipes as pp from pydoop.avrolib import AvroReader, AvroWriter, parse class UserReader(AvroReader): pass class ColorWriter(AvroWriter): schema = parse(open("stats.avsc").read()) def emit(self, key, value): self.writer.append({'office': key, 'counts': value}) class ColorPick(api.Mapper): def map(self, ctx): user = ctx.value color = user['favorite_color'] if color is not None: ctx.emit(user['office'], Counter({color: 1})) class ColorCount(api.Reducer): def reduce(self, ctx): s = sum(ctx.values, Counter()) ctx.emit(ctx.key, s) pp.run_task(pp.Factory( mapper_class=ColorPick, reducer_class=ColorCount, record_reader_class=UserReader, record_writer_class=ColorWriter ), private_encoding=True) ================================================ FILE: examples/avro/py/avro_value_in.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT from avro_base import AvroValueColorPick, run_task def __main__(): run_task(AvroValueColorPick) if __name__ == '__main__': __main__() ================================================ FILE: examples/avro/py/avro_value_in_out.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT from avro_base import AvroValueColorPick, AvroValueColorCount, run_task def __main__(): run_task(AvroValueColorPick, AvroValueColorCount) if __name__ == '__main__': __main__() ================================================ FILE: examples/avro/py/check_cc.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. 
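# In the pure-Python writer sketched above (avro_pyrw.py), each reduce-side
# emit appends one stats.avsc record to the Avro container; e.g., a call
# like emit('office-1', Counter({'red': 2})) appends
# {'office': 'office-1', 'counts': Counter({'red': 2})} -- Counter is a
# dict subclass, so it serializes as the schema's Avro map.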
You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT import sys import os import errno from collections import Counter from avro.io import DatumReader from avro.datafile import DataFileReader from pydoop.utils.py3compat import iteritems def iter_fnames(path): try: contents = os.listdir(path) except OSError as e: if e.errno == errno.ENOTDIR: yield path else: for name in contents: yield os.path.join(path, name) def main(in_, out_): expected = {} for in_fn in iter_fnames(in_): with open(in_fn, 'rb') as f: reader = DataFileReader(f, DatumReader()) for r in reader: expected.setdefault( r["office"], Counter() )[r["favorite_color"]] += 1 computed = {} for out_fn in iter_fnames(out_): with open(out_fn) as f: for l in f: p = l.strip().split('\t') computed[p[0]] = eval(p[1]) if set(computed) != set(expected): sys.exit("ERROR: computed keys != expected keys: %r != %r" % ( sorted(computed), sorted(expected))) for k, v in iteritems(expected): if computed[k] != v: sys.exit("ERROR: %r: %r != %r" % (k, computed[k], dict(v))) print('All is ok!') if __name__ == '__main__': main(sys.argv[1], sys.argv[2]) ================================================ FILE: examples/avro/py/check_results.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT import sys import os import errno from collections import Counter from pydoop.utils.py3compat import iteritems def iter_lines(path): try: contents = os.listdir(path) except OSError as e: if e.errno == errno.ENOTDIR: contents = [path] for name in contents: with open(os.path.join(path, name)) as f: for line in f: yield line def main(exp, res): expected = {} for l in iter_lines(exp): p = l.strip().split(';') expected.setdefault(p[1], Counter())[p[2]] += 1 computed = {} for l in iter_lines(res): p = l.strip().split('\t') computed[p[0]] = eval(p[1]) if set(computed) != set(expected): sys.exit("ERROR: computed keys != expected keys: %r != %r" % ( sorted(computed), sorted(expected))) for k, v in iteritems(expected): if computed[k] != v: sys.exit("ERROR: %r: %r != %r" % (k, computed[k], dict(v))) print('All is ok!') if __name__ == '__main__': main(sys.argv[1], sys.argv[2]) ================================================ FILE: examples/avro/py/color_count.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. 
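# check_results.py (above) compares two layouts; illustrative rows:
#
#   expected, from an input CSV row:   Alyssa;office-0;red
#   computed, from the MR output TSV:  office-0<TAB>Counter({'red': 1})
#
# The TSV payload is eval()'ed to rebuild the Counter before comparison.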
You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT # DOCS_INCLUDE_START from collections import Counter import pydoop.mapreduce.api as api import pydoop.mapreduce.pipes as pp class Mapper(api.Mapper): def map(self, ctx): user = ctx.value color = user['favorite_color'] if color is not None: ctx.emit(user['office'], Counter({color: 1})) class Reducer(api.Reducer): def reduce(self, ctx): s = sum(ctx.values, Counter()) ctx.emit('', {'office': ctx.key, 'counts': s}) def __main__(): pp.run_task(pp.Factory(Mapper, reducer_class=Reducer)) ================================================ FILE: examples/avro/py/create_input.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT import random import sys offices = ['office-%s' % i for i in range(3)] colors = ['red', 'blue', 'yellow', 'orange', 'maroon', 'green'] names = ['Alyssa', 'John', 'Kathy', 'Ben', 'Karla', 'Ross', 'Violetta'] def create_input(n, stream): for i in range(n): stream.write(';'.join([ random.choice(names), random.choice(offices), random.choice(colors), ]) + '\n') def main(n, filename): with open(filename, 'w') as f: create_input(n, f) if __name__ == '__main__': main(int(sys.argv[1]), sys.argv[2]) ================================================ FILE: examples/avro/py/gen_data.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT import os import pydoop.mapreduce.api as api import pydoop.mapreduce.pipes as pp class Mapper(api.Mapper): def map(self, ctx): name, length = ctx.value.split(None, 1) length = int(length) ctx.emit('', {'name': name, 'data': os.urandom(length)}) def __main__(): pp.run_task(pp.Factory(Mapper)) ================================================ FILE: examples/avro/py/generate_avro_users.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. 
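# gen_data.py (above) turns "name length" text lines into records carrying
# random bytes; e.g., a hypothetical input line "blob_1 1024" is emitted as
# {'name': 'blob_1', 'data': <1024 random bytes>}.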
You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT import sys import random import avro.schema from avro.datafile import DataFileWriter from avro.io import DatumWriter if sys.version_info[0] == 3: xrange = range parse = avro.schema.Parse else: parse = avro.schema.parse NAME_POOL = ['george', 'john', 'paul', 'ringo'] OFFICE_POOL = ['office-%d' % _ for _ in xrange(4)] COLOR_POOL = ['black', 'cyan', 'magenta', 'yellow'] def main(argv): try: schema_fn = argv[1] n_users = int(argv[2]) avro_fn = argv[3] except IndexError: sys.exit('Usage: %s SCHEMA_FILE N_USERS AVRO_FILE' % argv[0]) with open(schema_fn) as f_in: schema = parse(f_in.read()) with open(avro_fn, 'wb') as f_out: writer = DataFileWriter(f_out, DatumWriter(), schema) for i in xrange(n_users): writer.append({ 'name': random.choice(NAME_POOL), 'office': random.choice(OFFICE_POOL), 'favorite_color': random.choice(COLOR_POOL), 'favorite_number': i, }) writer.close() if __name__ == '__main__': main(sys.argv) ================================================ FILE: examples/avro/py/kmer_count.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT # DOCS_INCLUDE_START import pydoop.mapreduce.api as api import pydoop.mapreduce.pipes as pp WIDTH = 5 def window(s, width): for i in range(len(s) - width + 1): yield s[i: i + width] class Mapper(api.Mapper): def map(self, ctx): seq = ctx.value['sequence'] for kmer in window(seq, WIDTH): ctx.emit(kmer, 1) class Reducer(api.Reducer): def reduce(self, ctx): ctx.emit(ctx.key, sum(ctx.values)) def __main__(): pp.run_task(pp.Factory(Mapper, reducer_class=Reducer)) ================================================ FILE: examples/avro/py/show_kmer_count.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. 
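# window() in kmer_count.py (above) slides a fixed-width frame over the
# sequence, so a string of length L yields L - WIDTH + 1 k-mers; e.g.:
#
#   >>> list(window("ACGTAC", 5))
#   ['ACGTA', 'CGTAC']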
# # END_COPYRIGHT import sys import csv from operator import itemgetter LIMIT = 10 def main(argv): with open(argv[1]) as f: reader = csv.reader(f, delimiter='\t') data = [(k, int(v)) for (k, v) in reader] data.sort(key=itemgetter(1), reverse=True) for i, t in enumerate(data): sys.stdout.write('%s\t%d\n' % t) if i + 1 >= LIMIT: break if __name__ == '__main__': main(sys.argv) ================================================ FILE: examples/avro/py/write_avro.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT import sys import csv import avro.schema from avro.datafile import DataFileWriter from avro.io import DatumWriter parse = avro.schema.Parse if sys.version_info[0] == 3 else avro.schema.parse FIELDS = ['name', 'office', 'favorite_color'] def main(schema_fn, csv_fn, avro_fn): with open(schema_fn) as f_in: schema = parse(f_in.read()) with open(csv_fn) as f_in: reader = csv.reader(f_in, delimiter=';') with open(avro_fn, 'wb') as f_out: writer = DataFileWriter(f_out, DatumWriter(), schema) for row in reader: writer.append(dict(zip(FIELDS, row))) writer.close() if __name__ == '__main__': try: schema_fn = sys.argv[1] csv_fn = sys.argv[2] avro_fn = sys.argv[3] except IndexError: sys.exit('Usage: %s SCHEMA_FILE CSV_FILE AVRO_FILE' % sys.argv[0]) main(schema_fn, csv_fn, avro_fn) ================================================ FILE: examples/avro/run ================================================ #!/usr/bin/env bash set -euo pipefail [ -n "${DEBUG:-}" ] && set -x this="${BASH_SOURCE-$0}" this_dir=$(cd -P -- "$(dirname -- "${this}")" && pwd -P) . "${this_dir}/../config.sh" # These examples could be adapted to also run on the local fs, but we have # enough coverage from the other ones. if [ "$(hadoop_fs)" = "file" ]; then echo "default file system is local, skipping all examples" exit 0 fi for io in "in" "in_out"; do for mode in "k" "v" "kv"; do "${this_dir}"/run_avro_container_${io} ${mode} done "${this_dir}"/run_avro_parquet_${io} done "${this_dir}"/run_avro_pyrw "${this_dir}"/run_color_count "${this_dir}"/run_kmer_count ================================================ FILE: examples/avro/run_avro_container_in ================================================ #!/usr/bin/env bash # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. 
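# The run driver above exercises each container example in all six
# io/mode combinations ({in, in_out} x {k, v, kv}); each iteration is
# equivalent to a manual call such as:
#
#   ./run_avro_container_in kv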
# # END_COPYRIGHT set -euo pipefail [ -n "${DEBUG:-}" ] && set -x this="${BASH_SOURCE-$0}" this_dir=$(cd -P -- "$(dirname -- "${this}")" && pwd -P) . "${this_dir}/../config.sh" . "${this_dir}/config.sh" nargs=1 if [ $# -ne ${nargs} ]; then die "Usage: $0 k|v|kv" fi mode=$1 if [ "${mode}" == "k" ]; then MODULE=avro_key_in elif [ "${mode}" == "v" ]; then MODULE=avro_value_in elif [ "${mode}" == "kv" ]; then MODULE=avro_key_value_in else die "invalid mode: ${mode}" fi pushd "${this_dir}" USER_SCHEMA_FILE=schemas/user.avsc PET_SCHEMA_FILE=schemas/pet.avsc CSV_INPUT=$(mktemp -d) LOCAL_INPUT=$(mktemp -d) INPUT=$(basename ${LOCAL_INPUT}) OUTPUT=results # --- generate avro input --- N=20 for i in 1 2; do ${PYTHON} py/create_input.py ${N} "${CSV_INPUT}/users_${i}.csv" done if [ "${mode}" == "kv" ]; then for i in 1 2; do ./write_avro_kv "${USER_SCHEMA_FILE}" "${PET_SCHEMA_FILE}" \ "${CSV_INPUT}/users_${i}.csv" "${LOCAL_INPUT}/users_${i}.avro" done else for i in 1 2; do ${PYTHON} py/write_avro.py "${USER_SCHEMA_FILE}" \ "${CSV_INPUT}/users_${i}.csv" "${LOCAL_INPUT}/users_${i}.avro" done fi ${HADOOP} fs -mkdir -p /user/"${USER}" ${HADOOP} fs -rm "${INPUT}" || : ${HADOOP} fs -put "${LOCAL_INPUT}" "${INPUT}" # --- run cc --- MPY=py/"${MODULE}".py JOBNAME="${MODULE}"-job LOGLEVEL="DEBUG" ${HADOOP} fs -rm -r "/user/${USER}/${OUTPUT}" || : ${PYDOOP} submit \ --upload-file-to-cache py/avro_base.py \ --upload-file-to-cache "${MPY}" \ --num-reducers 1 \ --avro-input "${mode}" \ --log-level "${LOGLEVEL}" \ --job-name "${JOBNAME}" \ "${MODULE}" "${INPUT}" "${OUTPUT}" # --- check results --- rm -rf "${OUTPUT}" ${HADOOP} fs -get "${OUTPUT}" ${PYTHON} py/check_results.py "${CSV_INPUT}" "${OUTPUT}" rm -rf "${CSV_INPUT}" "${LOCAL_INPUT}" "${OUTPUT}" popd ================================================ FILE: examples/avro/run_avro_container_in_out ================================================ #!/usr/bin/env bash # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT set -euo pipefail [ -n "${DEBUG:-}" ] && set -x this="${BASH_SOURCE-$0}" this_dir=$(cd -P -- "$(dirname -- "${this}")" && pwd -P) . "${this_dir}/../config.sh" . 
"${this_dir}/config.sh" nargs=1 if [ $# -ne ${nargs} ]; then die "Usage: $0 k|v|kv" fi mode=$1 if [ "${mode}" == "k" ]; then MODULE=avro_key_in_out elif [ "${mode}" == "v" ]; then MODULE=avro_value_in_out elif [ "${mode}" == "kv" ]; then MODULE=avro_key_value_in_out else die "invalid mode: ${mode}" fi pushd "${this_dir}" USER_SCHEMA_FILE=schemas/user.avsc PET_SCHEMA_FILE=schemas/pet.avsc STATS_SCHEMA_FILE=schemas/stats.avsc STATS_SCHEMA=$(cat "${STATS_SCHEMA_FILE}") CSV_INPUT=$(mktemp -d) LOCAL_INPUT=$(mktemp -d) INPUT=$(basename ${LOCAL_INPUT}) OUTPUT=results # --- generate avro input --- N=20 for i in 1 2; do ${PYTHON} py/create_input.py ${N} "${CSV_INPUT}/users_${i}.csv" done if [ "${mode}" == "kv" ]; then for i in 1 2; do ./write_avro_kv "${USER_SCHEMA_FILE}" "${PET_SCHEMA_FILE}" \ "${CSV_INPUT}/users_${i}.csv" "${LOCAL_INPUT}/users_${i}.avro" done else for i in 1 2; do ${PYTHON} py/write_avro.py "${USER_SCHEMA_FILE}" \ "${CSV_INPUT}/users_${i}.csv" "${LOCAL_INPUT}/users_${i}.avro" done fi ${HADOOP} fs -mkdir -p /user/"${USER}" ${HADOOP} fs -rm -r "${INPUT}" || : ${HADOOP} fs -put "${LOCAL_INPUT}" "${INPUT}" # --- run cc --- MPY=py/"${MODULE}".py JOBNAME="${MODULE}"-job LOGLEVEL="DEBUG" # put the following opts at the end of the command line # or the empty string will be parsed as the module arg if [ "${mode}" == "k" ]; then K_SCHEMA_OPT="-D pydoop.mapreduce.avro.key.output.schema=${STATS_SCHEMA}" V_SCHEMA_OPT="" elif [ "${mode}" == "v" ]; then K_SCHEMA_OPT="" V_SCHEMA_OPT="-D pydoop.mapreduce.avro.value.output.schema=${STATS_SCHEMA}" else K_SCHEMA_OPT="-D pydoop.mapreduce.avro.key.output.schema=${STATS_SCHEMA}" V_SCHEMA_OPT="-D pydoop.mapreduce.avro.value.output.schema=${STATS_SCHEMA}" fi ${HADOOP} fs -rm -r "/user/${USER}/${OUTPUT}" || : ${PYDOOP} submit \ --upload-file-to-cache py/avro_base.py \ --upload-file-to-cache "${MPY}" \ --num-reducers 1 \ --avro-input "${mode}" \ --avro-output "${mode}" \ --log-level "${LOGLEVEL}" \ --job-name "${JOBNAME}" \ "${MODULE}" "${INPUT}" "${OUTPUT}" \ "${K_SCHEMA_OPT}" "${V_SCHEMA_OPT}" # --- dump & check results --- DUMP_DIR=$(mktemp -d) rm -rf "${OUTPUT}" ${HADOOP} fs -get "${OUTPUT}" for f in "${OUTPUT}"/part*; do ${PYTHON} py/avro_container_dump_results.py \ "${f}" "${DUMP_DIR}"/$(basename ${f}).tsv "${mode}" done ${PYTHON} py/check_results.py "${CSV_INPUT}" "${DUMP_DIR}" rm -rf "${CSV_INPUT}" "${LOCAL_INPUT}" "${OUTPUT}" "${DUMP_DIR}" popd ================================================ FILE: examples/avro/run_avro_parquet_in ================================================ #!/usr/bin/env bash # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT set -euo pipefail [ -n "${DEBUG:-}" ] && set -x this="${BASH_SOURCE-$0}" this_dir=$(cd -P -- "$(dirname -- "${this}")" && pwd -P) . "${this_dir}/../config.sh" . 
"${this_dir}/config.sh" pushd "${this_dir}" [ -f "${JAR_PATH}" ] || ./build.sh SCHEMA_FILE_LOCAL=schemas/user.avsc SCHEMA_FILE_HDFS=user.avsc # --- create input --- CSV_INPUT=$(mktemp -d) INPUT=$(basename ${CSV_INPUT}) PARQUETS_DIR=parquets N=20 for i in 1 2; do ${PYTHON} py/create_input.py ${N} "${CSV_INPUT}/users_${i}.csv" done # --- convert to avro-parquet --- ${HADOOP} fs -mkdir -p /user/"${USER}" ${HADOOP} fs -rm -r /user/"${USER}"/"${PARQUETS_DIR}" || : ${HADOOP} fs -rm -r "${INPUT}" || : ${HADOOP} fs -put "${CSV_INPUT}" "${INPUT}" ${HADOOP} fs -put -f "${SCHEMA_FILE_LOCAL}" "${SCHEMA_FILE_HDFS}" export HADOOP_CLASSPATH=$(<"${CP_PATH}") ${HADOOP} jar "${JAR_PATH}" it.crs4.pydoop.WriteParquet \ -libjars="${HADOOP_CLASSPATH//:/,}" \ "${INPUT}" "${PARQUETS_DIR}" "${SCHEMA_FILE_HDFS}" # --- run color count --- MODULE=avro_value_in MPY=py/"${MODULE}".py JOBNAME="${MODULE}"-job LOGLEVEL="DEBUG" USER_SCHEMA=$(cat "${SCHEMA_FILE_LOCAL}") INPUT_FORMAT=org.apache.parquet.avro.AvroParquetInputFormat INPUT="${PARQUETS_DIR}" OUTPUT=results ${HADOOP} fs -rm -r /user/"${USER}"/"${OUTPUT}" || : ${PYDOOP} submit --upload-file-to-cache "${MPY}" \ --upload-file-to-cache py/avro_base.py \ --num-reducers 1 \ --input-format "${INPUT_FORMAT}" \ --avro-input v \ --libjars "${JAR_PATH},${HADOOP_CLASSPATH//:/,}" \ --log-level "${LOGLEVEL}" \ --job-name "${JOBNAME}" \ "${MODULE}" "${PARQUETS_DIR}" "${OUTPUT}" # --- check results --- rm -rf "${OUTPUT}" ${HADOOP} fs -get /user/"${USER}"/"${OUTPUT}" ${PYTHON} py/check_results.py "${CSV_INPUT}" "${OUTPUT}" rm -rf "${CSV_INPUT}" "${OUTPUT}" popd ================================================ FILE: examples/avro/run_avro_parquet_in_out ================================================ #!/usr/bin/env bash # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT set -euo pipefail [ -n "${DEBUG:-}" ] && set -x this="${BASH_SOURCE-$0}" this_dir=$(cd -P -- "$(dirname -- "${this}")" && pwd -P) . "${this_dir}/../config.sh" . 
"${this_dir}/config.sh" pushd "${this_dir}" [ -f "${JAR_PATH}" ] || ./build.sh IN_SCHEMA_FILE_LOCAL=schemas/user.avsc IN_SCHEMA_FILE_HDFS=user.avsc OUT_SCHEMA_FILE_LOCAL=schemas/stats.avsc OUT_SCHEMA_FILE_HDFS=stats.avsc # --- create input --- CSV_INPUT=$(mktemp -d) INPUT=$(basename ${CSV_INPUT}) PARQUETS_DIR=parquets N=20 for i in 1 2; do ${PYTHON} py/create_input.py ${N} "${CSV_INPUT}/users_${i}.csv" done # --- convert to avro-parquet --- ${HADOOP} fs -mkdir -p /user/"${USER}" ${HADOOP} fs -rm -r /user/"${USER}"/"${PARQUETS_DIR}" || : ${HADOOP} fs -rm -r "${INPUT}" || : ${HADOOP} fs -put "${CSV_INPUT}" "${INPUT}" ${HADOOP} fs -put -f "${IN_SCHEMA_FILE_LOCAL}" "${IN_SCHEMA_FILE_HDFS}" export HADOOP_CLASSPATH=$(<"${CP_PATH}") hadoop jar "${JAR_PATH}" it.crs4.pydoop.WriteParquet \ -libjars="${HADOOP_CLASSPATH//:/,}" \ "${INPUT}" "${PARQUETS_DIR}" "${IN_SCHEMA_FILE_HDFS}" # --- run color count --- MODULE=avro_value_in_out MPY=py/"${MODULE}".py JOBNAME="${MODULE}"-job LOGLEVEL="DEBUG" STATS_SCHEMA=$(cat "${OUT_SCHEMA_FILE_LOCAL}") INPUT_FORMAT=org.apache.parquet.avro.AvroParquetInputFormat OUTPUT_FORMAT=org.apache.parquet.avro.AvroParquetOutputFormat CC_OUTPUT=cc_output ${HADOOP} fs -rm -r /user/"${USER}"/"${CC_OUTPUT}" || : ${PYDOOP} submit \ -D pydoop.mapreduce.avro.value.output.schema="${STATS_SCHEMA}" \ -D parquet.avro.schema="${STATS_SCHEMA}" \ --upload-file-to-cache py/avro_base.py \ --upload-file-to-cache "${MPY}" \ --num-reducers 1 \ --input-format "${INPUT_FORMAT}" \ --output-format "${OUTPUT_FORMAT}" \ --avro-input v \ --avro-output v \ --libjars "${JAR_PATH},${HADOOP_CLASSPATH//:/,}" \ --log-level "${LOGLEVEL}" \ --job-name "${JOBNAME}" \ "${MODULE}" "${PARQUETS_DIR}" "${CC_OUTPUT}" # --- dump results --- MODULE=avro_parquet_dump_results MPY=py/"${MODULE}".py JOBNAME="${MODULE}"-job LOGLEVEL="DEBUG" STATS_SCHEMA=$(cat "${OUT_SCHEMA_FILE_LOCAL}") INPUT_FORMAT=org.apache.parquet.avro.AvroParquetInputFormat OUTPUT=results ${HADOOP} fs -rm -r /user/"${USER}"/"${OUTPUT}" || : ${PYDOOP} submit \ --upload-file-to-cache "${MPY}" \ --num-reducers 0 \ --input-format "${INPUT_FORMAT}" \ --avro-input v \ --libjars "${JAR_PATH},${HADOOP_CLASSPATH//:/,}" \ --log-level "${LOGLEVEL}" \ --job-name "${JOBNAME}" \ "${MODULE}" "${CC_OUTPUT}" "${OUTPUT}" # --- check results --- rm -rf "${OUTPUT}" ${HADOOP} fs -get /user/"${USER}"/"${OUTPUT}" ${PYTHON} py/check_results.py "${CSV_INPUT}" "${OUTPUT}" rm -rf "${CSV_INPUT}" "${OUTPUT}" popd ================================================ FILE: examples/avro/run_avro_pyrw ================================================ #!/usr/bin/env bash # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT set -euo pipefail [ -n "${DEBUG:-}" ] && set -x this="${BASH_SOURCE-$0}" this_dir=$(cd -P -- "$(dirname -- "${this}")" && pwd -P) . 
"${this_dir}/../config.sh" pushd "${this_dir}" USER_SCHEMA_FILE=schemas/user.avsc STATS_SCHEMA_FILE=schemas/stats.avsc CSV_INPUT=$(mktemp -d) LOCAL_INPUT=$(mktemp -d) INPUT=$(basename ${LOCAL_INPUT}) OUTPUT=results # --- generate avro input --- N=20 for i in 1 2; do ${PYTHON} py/create_input.py ${N} "${CSV_INPUT}/users_${i}.csv" ${PYTHON} py/write_avro.py "${USER_SCHEMA_FILE}" \ "${CSV_INPUT}/users_${i}.csv" "${LOCAL_INPUT}/users_${i}.avro" done # --- run cc --- MODULE=avro_pyrw MPY=py/"${MODULE}".py JOBNAME="${MODULE}"-job LOGLEVEL="DEBUG" ${HADOOP} fs -mkdir -p /user/"${USER}" ${HADOOP} fs -rm "${INPUT}" || : ${HADOOP} fs -put "${LOCAL_INPUT}" "${INPUT}" ${HADOOP} fs -rm -r "/user/${USER}/${OUTPUT}" || : ${PYDOOP} submit \ --upload-file-to-cache "${MPY}" \ --upload-file-to-cache "${USER_SCHEMA_FILE}" \ --upload-file-to-cache "${STATS_SCHEMA_FILE}" \ --num-reducers 1 \ --do-not-use-java-record-reader \ --do-not-use-java-record-writer \ --log-level "${LOGLEVEL}" \ --job-name "${JOBNAME}" \ "${MODULE}" "${INPUT}" "${OUTPUT}" # --- dump & check results --- DUMP_DIR=$(mktemp -d) rm -rf "${OUTPUT}" ${HADOOP} fs -get "${OUTPUT}" for f in "${OUTPUT}"/part*; do ${PYTHON} py/avro_container_dump_results.py \ "${f}" "${DUMP_DIR}"/$(basename ${f}).tsv done ${PYTHON} py/check_results.py "${CSV_INPUT}" "${DUMP_DIR}" rm -rf "${CSV_INPUT}" "${LOCAL_INPUT}" "${OUTPUT}" "${DUMP_DIR}" popd ================================================ FILE: examples/avro/run_color_count ================================================ #!/usr/bin/env bash # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT set -euo pipefail [ -n "${DEBUG:-}" ] && set -x this="${BASH_SOURCE-$0}" this_dir=$(cd -P -- "$(dirname -- "${this}")" && pwd -P) . 
"${this_dir}/../config.sh" pushd "${this_dir}" MODULE="color_count" USER_SCHEMA_FILE=schemas/user.avsc STATS_SCHEMA_FILE=schemas/stats.avsc STATS_SCHEMA=$(cat "${STATS_SCHEMA_FILE}") LOCAL_INPUT=$(mktemp -d) INPUT=$(basename ${LOCAL_INPUT}) OUTPUT=results # --- generate avro input --- N=20 for i in 1 2; do ${PYTHON} py/generate_avro_users.py "${USER_SCHEMA_FILE}" ${N} \ "${LOCAL_INPUT}/users_${i}.avro" done ${HADOOP} fs -mkdir -p /user/"${USER}" ${HADOOP} fs -rm -r "${INPUT}" || : ${HADOOP} fs -put "${LOCAL_INPUT}" "${INPUT}" # --- run cc --- MPY=py/"${MODULE}".py JOBNAME="${MODULE}"-job LOGLEVEL="DEBUG" ${HADOOP} fs -rm -r "/user/${USER}/${OUTPUT}" || : ${PYDOOP} submit \ -D pydoop.mapreduce.avro.value.output.schema="${STATS_SCHEMA}" \ --upload-file-to-cache "${MPY}" \ --num-reducers 1 \ --avro-input v \ --avro-output v \ --log-level "${LOGLEVEL}" \ --job-name "${JOBNAME}" \ "${MODULE}" "${INPUT}" "${OUTPUT}" # --- dump & check results --- DUMP_DIR=$(mktemp -d) rm -rf "${OUTPUT}" ${HADOOP} fs -get "${OUTPUT}" for f in "${OUTPUT}"/part*; do ${PYTHON} py/avro_container_dump_results.py \ "${f}" "${DUMP_DIR}"/$(basename ${f}).tsv done ${PYTHON} py/check_cc.py "${LOCAL_INPUT}" "${DUMP_DIR}" rm -rf "${LOCAL_INPUT}" "${OUTPUT}" "${DUMP_DIR}" popd ================================================ FILE: examples/avro/run_kmer_count ================================================ #!/usr/bin/env bash # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT set -euo pipefail [ -n "${DEBUG:-}" ] && set -x this="${BASH_SOURCE-$0}" this_dir=$(cd -P -- "$(dirname -- "${this}")" && pwd -P) . "${this_dir}/../config.sh" . 
"${this_dir}/config.sh" pushd "${this_dir}" [ -f "${JAR_PATH}" ] || ./build.sh MODULE=kmer_count MPY=py/"${MODULE}".py JOBNAME="${MODULE}"-job LOGLEVEL="DEBUG" INPUT_FORMAT=org.apache.parquet.avro.AvroParquetInputFormat PROJECTION=$(cat schemas/alignment_record_proj.avsc) LOCAL_INPUT=$(mktemp -d) INPUT=$(basename ${LOCAL_INPUT}) OUTPUT=kmer_count for i in 1 2; do cp data/mini_aligned_seqs.gz.parquet ${LOCAL_INPUT}/seqs_${i}.gz.parquet done ${HADOOP} fs -mkdir -p /user/"${USER}" ${HADOOP} fs -rm -r "${INPUT}" || : ${HADOOP} fs -put "${LOCAL_INPUT}" "${INPUT}" ${HADOOP} fs -rm -r /user/"${USER}"/"${OUTPUT}" || : HADOOP_CLASSPATH=$(<"${CP_PATH}") ${PYDOOP} submit \ -D parquet.avro.projection="${PROJECTION}" \ --upload-file-to-cache "${MPY}" \ --num-reducers 1 \ --input-format "${INPUT_FORMAT}" \ --avro-input v \ --libjars "${JAR_PATH},${HADOOP_CLASSPATH//:/,}" \ --log-level "${LOGLEVEL}" \ --job-name "${JOBNAME}" \ "${MODULE}" "${INPUT}" "${OUTPUT}" rm -rf "${OUTPUT}" ${HADOOP} fs -get /user/"${USER}"/"${OUTPUT}" ${PYTHON} py/show_kmer_count.py "${OUTPUT}"/part-r-00000 rm -rf "${OUTPUT}" "${LOCAL_INPUT}" popd ================================================ FILE: examples/avro/schemas/alignment_record.avsc ================================================ { "type": "record", "name": "AlignmentRecord", "fields": [ { "default": null, "doc": "The reference sequence details for the reference chromosome that\n this read is aligned to. If the read is unaligned, this field should\n be null.", "name": "contig", "type": [ "null", { "type": "record", "name": "Contig", "doc": "Record for describing a reference assembly. Not used for storing the contents\n of said assembly.\n\n @see NucleotideContigFragment", "fields": [ { "default": null, "doc": "The name of this contig in the assembly (e.g., \"chr1\").", "name": "contigName", "type": ["null", "string"] }, { "default": null, "doc": "The length of this contig.", "name": "contigLength", "type": ["null", "long"] }, { "default": null, "doc": "The MD5 checksum of the assembly for this contig.", "name": "contigMD5", "type": ["null", "string"] }, { "default": null, "doc": "The URL at which this reference assembly can be found.", "name": "referenceURL", "type": ["null", "string"] }, { "default": null, "doc": "The name of this assembly (e.g., \"hg19\").", "name": "assembly", "type": ["null", "string"] }, { "default": null, "doc": "The species that this assembly is for.", "name": "species", "type": ["null", "string"] } ] } ] }, { "default": null, "doc": "0 based reference position for the start of this read's alignment.\n Should be null if the read is unaligned.", "name": "start", "type": ["null", "long"] }, { "default": null, "doc": "0 based reference position where this read used to start before\n local realignment.\n Stores the same data as the OP field in the SAM format.", "name": "oldPosition", "type": ["null", "long"] }, { "default": null, "doc": "0 based reference position for the end of this read's alignment.\n Should be null if the read is unaligned.", "name": "end", "type": ["null", "long"] }, { "default": null, "doc": "The global mapping quality of this read.", "name": "mapq", "type": ["null", "int"] }, { "default": null, "doc": "The name of this read. This should be unique within the read group\n that this read is from, and can be used to identify other reads that\n are derived from a single fragment.", "name": "readName", "type": ["null", "string"] }, { "default": null, "doc": "The bases in this alignment. 
If the read has been hard clipped, this may\n not represent all the bases in the original read.", "name": "sequence", "type": ["null", "string"] }, { "default": null, "doc": "The per-base quality scores in this alignment. If the read has been hard\n clipped, this may not represent all the bases in the original read.\n Additionally, if the error scores have been recalibrated, this field\n will not contain the original base quality scores.\n\n @see origQual", "name": "qual", "type": ["null", "string"] }, { "default": null, "doc": "The Compact Ideosyncratic Gapped Alignment Report (CIGAR) string that\n describes the local alignment of this read. Contains {length, operator}\n pairs for all contiguous alignment operations. The operators include:\n \n * M, ALIGNMENT_MATCH: An alignment match indicates that a sequence can be\n aligned to the reference without evidence of an INDEL. Unlike the\n SEQUENCE_MATCH and SEQUENCE_MISMATCH operators, the ALIGNMENT_MATCH\n operator does not indicate whether the reference and read sequences are an\n exact match.\n * I, INSERT: The insert operator indicates that the read contains evidence of\n bases being inserted into the reference.\n * D, DELETE: The delete operator indicates that the read contains evidence of\n bases being deleted from the reference.\n * N, SKIP: The skip operator indicates that this read skips a long segment of\n the reference, but the bases have not been deleted. This operator is\n commonly used when working with RNA-seq data, where reads may skip long\n segments of the reference between exons.\n * S, CLIP_SOFT: The soft clip operator indicates that bases at the start/end\n of a read have not been considered during alignment. This may occur if the\n majority of a read maps, except for low quality bases at the start/end of\n a read. Bases that are soft clipped will still be stored in the read.\n * H, CLIP_HARD: The hard clip operator indicates that bases at the start/end of\n a read have been omitted from this alignment. This may occur if this linear\n alignment is part of a chimeric alignment, or if the read has been trimmed\n (e.g., during error correction, or to trim poly-A tails for RNA-seq).\n * P, PAD: The pad operator indicates that there is padding in an alignment.\n * =, SEQUENCE_MATCH: This operator indicates that this portion of the aligned\n sequence exactly matches the reference (e.g., all bases are equal to the\n reference bases).\n * X, SEQUENCE_MISMATCH: This operator indicates that this portion of the \n aligned sequence is an alignment match to the reference, but a sequence\n mismatch (e.g., the bases are not equal to the reference). This can\n indicate a SNP or a read error.", "name": "cigar", "type": ["null", "string"] }, { "default": null, "doc": "Stores the CIGAR string present before local indel realignment.\n Stores the same data as the OC field in the SAM format.\n\n @see cigar", "name": "oldCigar", "type": ["null", "string"] }, { "default": 0, "doc": "The number of bases in this read/alignment that have been trimmed from the\n start of the read. By default, this is equal to 0. If the value is non-zero,\n that means that the start of the read has been hard-clipped.\n\n @see cigar", "name": "basesTrimmedFromStart", "type": ["int", "null"] }, { "default": 0, "doc": "The number of bases in this read/alignment that have been trimmed from the\n end of the read. By default, this is equal to 0. 
If the value is non-zero,\n that means that the end of the read has been hard-clipped.\n\n @see cigar", "name": "basesTrimmedFromEnd", "type": ["int", "null"] }, { "default": false, "name": "readPaired", "type": ["boolean", "null"] }, { "default": false, "name": "properPair", "type": ["boolean", "null"] }, { "default": false, "name": "readMapped", "type": ["boolean", "null"] }, { "default": false, "name": "mateMapped", "type": ["boolean", "null"] }, { "default": false, "name": "firstOfPair", "type": ["boolean", "null"] }, { "default": false, "name": "secondOfPair", "type": ["boolean", "null"] }, { "default": false, "name": "failedVendorQualityChecks", "type": ["boolean", "null"] }, { "default": false, "name": "duplicateRead", "type": ["boolean", "null"] }, { "default": false, "doc": "True if this alignment is mapped as a reverse compliment. This field\n defaults to false.", "name": "readNegativeStrand", "type": ["boolean", "null"] }, { "default": false, "doc": "True if the mate pair of this alignment is mapped as a reverse compliment.\n This field defaults to false.", "name": "mateNegativeStrand", "type": ["boolean", "null"] }, { "default": false, "doc": "This field is true if this alignment is either the best linear alignment,\n or the first linear alignment in a chimeric alignment. Defaults to false.\n\n @see secondaryAlignment\n @see supplementaryAlignment", "name": "primaryAlignment", "type": ["boolean", "null"] }, { "default": false, "doc": "This field is true if this alignment is a lower quality linear alignment\n for a multiply-mapped read. Defaults to false.\n\n @see primaryAlignment\n @see supplementaryAlignment", "name": "secondaryAlignment", "type": ["boolean", "null"] }, { "default": false, "doc": "This field is true if this alignment is a non-primary linear alignment in\n a chimeric alignment. Defaults to false.\n\n @see primaryAlignment\n @see secondaryAlignment", "name": "supplementaryAlignment", "type": ["boolean", "null"] }, { "default": null, "name": "mismatchingPositions", "type": ["null", "string"] }, { "default": null, "name": "origQual", "type": ["null", "string"] }, { "default": null, "name": "attributes", "type": ["null", "string"] }, { "default": null, "name": "recordGroupName", "type": ["null", "string"] }, { "default": null, "name": "recordGroupSequencingCenter", "type": ["null", "string"] }, { "default": null, "name": "recordGroupDescription", "type": ["null", "string"] }, { "default": null, "name": "recordGroupRunDateEpoch", "type": ["null", "long"] }, { "default": null, "name": "recordGroupFlowOrder", "type": ["null", "string"] }, { "default": null, "name": "recordGroupKeySequence", "type": ["null", "string"] }, { "default": null, "name": "recordGroupLibrary", "type": ["null", "string"] }, { "default": null, "name": "recordGroupPredictedMedianInsertSize", "type": ["null", "int"] }, { "default": null, "name": "recordGroupPlatform", "type": ["null", "string"] }, { "default": null, "name": "recordGroupPlatformUnit", "type": ["null", "string"] }, { "default": null, "name": "recordGroupSample", "type": ["null", "string"] }, { "default": null, "doc": "The start position of the mate of this read. Should be set to null if the\n mate is unaligned, or if the mate does not exist.", "name": "mateAlignmentStart", "type": ["null", "long"] }, { "default": null, "doc": "The end position of the mate of this read. 
Should be set to null if the\n mate is unaligned, or if the mate does not exist.", "name": "mateAlignmentEnd", "type": ["null", "long"] }, { "default": null, "doc": "The reference contig of the mate of this read. Should be set to null if the\n mate is unaligned, or if the mate does not exist.", "name": "mateContig", "type": ["null", "Contig"] } ] } ================================================ FILE: examples/avro/schemas/alignment_record_proj.avsc ================================================ { "type": "record", "name": "AlignmentRecord", "fields": [ { "default": null, "doc": "The global mapping quality of this read.", "name": "mapq", "type": ["null", "int"] }, { "default": null, "doc": "The bases in this alignment. If the read has been hard clipped, this may\n not represent all the bases in the original read.", "name": "sequence", "type": ["null", "string"] }, { "default": false, "name": "readMapped", "type": ["boolean", "null"] } ] } ================================================ FILE: examples/avro/schemas/pet.avsc ================================================ { "namespace": "example.avro", "type": "record", "name": "Pet", "fields": [ {"name": "name", "type": "string"}, {"name": "legs", "type": "int"} ] } ================================================ FILE: examples/avro/schemas/stats.avsc ================================================ { "namespace": "example.avro", "type": "record", "name": "Stats", "fields": [ {"name": "office", "type": "string"}, {"name": "counts", "type": {"type": "map", "values": "long"}} ] } ================================================ FILE: examples/avro/schemas/user.avsc ================================================ { "namespace": "example.avro", "type": "record", "name": "User", "fields": [ {"name": "office", "type": "string"}, {"name": "name", "type": "string"}, {"name": "favorite_number", "type": ["int", "null"]}, {"name": "favorite_color", "type": ["string", "null"]} ] } ================================================ FILE: examples/avro/src/main/java/it/crs4/pydoop/WriteKV.java ================================================ /** BEGIN_COPYRIGHT * * Copyright 2009-2026 CRS4. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy * of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations * under the License. * * END_COPYRIGHT * * Read user data generated by create_input.py and create a key/value * avro file with those users as keys. 
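 *
 * Each output record is an AvroKeyValue pair; with the bundled schemas an
 * entry looks like (field values are illustrative):
 *
 *   key   = {"name": "john", "office": "office-2",
 *            "favorite_number": null, "favorite_color": "cyan"}
 *   value = {"name": "pet-0", "legs": 0}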
*/ package it.crs4.pydoop; import java.io.File; import java.io.IOException; import java.io.BufferedReader; import java.io.FileReader; import java.util.List; import java.util.ArrayList; import org.apache.avro.Schema; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.GenericDatumWriter; import org.apache.avro.io.DatumWriter; import org.apache.avro.file.DataFileWriter; import org.apache.avro.hadoop.io.AvroKeyValue; class WriteKV { private static final String DELIMITER = ";"; private static GenericRecord buildUser( Schema schema, String name, String office, String color) { GenericRecord user = new GenericData.Record(schema); user.put("name", name); user.put("office", office); if (color != null) user.put("favorite_color", color); return user; } private static GenericRecord buildPet( Schema schema, String name, Integer legs) { GenericRecord pet = new GenericData.Record(schema); pet.put("name", name); pet.put("legs", legs); return pet; } private static <T> File createFile(File file, Schema schema, T... records) throws IOException { DatumWriter<T> datumWriter = new GenericDatumWriter<T>(schema); DataFileWriter<T> fileWriter = new DataFileWriter<T>(datumWriter); fileWriter.create(schema, file); for (T record: records) { fileWriter.append(record); } fileWriter.close(); return file; } private static File createInputFile( Schema keySchema, Schema valueSchema, String inFN, String outFN ) throws IOException { Schema keyValueSchema = AvroKeyValue.getSchema(keySchema, valueSchema); List<GenericRecord> records = new ArrayList<GenericRecord>(); BufferedReader reader = new BufferedReader(new FileReader(inFN)); String line; int i = 0; while ((line = reader.readLine()) != null) { String[] tokens = line.split(DELIMITER); if (tokens.length != 3) { // name, office, color throw new RuntimeException("Bad input format"); } GenericRecord user = buildUser( keySchema, tokens[0], tokens[1], tokens[2] ); GenericRecord pet = buildPet(valueSchema, String.format("pet-%d", i), i); AvroKeyValue<GenericRecord, GenericRecord> kv = new AvroKeyValue<GenericRecord, GenericRecord>( new GenericData.Record(keyValueSchema)); kv.setKey(user); kv.setValue(pet); records.add(kv.get()); i++; } reader.close(); return createFile( new File(outFN), keyValueSchema, records.toArray(new GenericRecord[records.size()]) ); } public static void main(String[] args) throws Exception { if (args.length < 4) { System.err.println( "Usage: WriteKV USER_SCHEMA PET_SCHEMA IN_FILE OUT_FILE" ); System.exit(1); } Schema.Parser parser = new Schema.Parser(); Schema userSchema = parser.parse(new File(args[0])); Schema petSchema = parser.parse(new File(args[1])); File file = createInputFile(userSchema, petSchema, args[2], args[3]); System.out.println("wrote " + file.getName()); } } ================================================ FILE: examples/avro/src/main/java/it/crs4/pydoop/WriteParquet.java ================================================ /* BEGIN_COPYRIGHT * * Copyright 2009-2026 CRS4. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy * of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations * under the License.
* * END_COPYRIGHT * * A MapReduce application that reads ';'-separated text and writes * parquet-avro data (i.e., Parquet files that use the Avro object model). * * Based on Cloudera Parquet examples. */ package it.crs4.pydoop; import java.io.IOException; import java.io.InputStream; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.Mapper.Context; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.parquet.Log; import org.apache.parquet.avro.AvroParquetOutputFormat; import org.apache.avro.generic.GenericData.Record; import org.apache.avro.Schema; public class WriteParquet extends Configured implements Tool { private static final Log LOG = Log.getLog(WriteParquet.class); // FIXME: not needed, we're calling setSchema below private static final String SCHEMA_PATH_KEY = "paexample.schema.path"; private static Schema getSchema(Configuration conf) throws IOException { Path schemaPath = new Path(conf.get(SCHEMA_PATH_KEY)); FileSystem fs = FileSystem.get(conf); InputStream in = fs.open(schemaPath); Schema schema = new Schema.Parser().parse(in); in.close(); return schema; } public static class WriteUserMap extends Mapper<LongWritable, Text, Void, Record> { private Schema schema; @Override public void setup(Context context) throws IOException, InterruptedException { schema = getSchema(context.getConfiguration()); } @Override public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { NullWritable outKey = NullWritable.get(); Record user = new Record(schema); String[] elements = value.toString().split(";"); user.put("name", elements[0]); user.put("office", elements[1]); user.put("favorite_color", elements[2]); context.write(null, user); } } public int run(String[] args) throws Exception { if (args.length < 3) { System.err.println( "Usage: WriteParquet <input path> <output path> <schema path>" ); return -1; } Path inputPath = new Path(args[0]); Path outputPath = new Path(args[1]); String schemaPathName = args[2]; Configuration conf = getConf(); conf.set(SCHEMA_PATH_KEY, schemaPathName); Schema schema = getSchema(conf); Job job = new Job(conf); job.setJarByClass(getClass()); job.setJobName(getClass().getName()); AvroParquetOutputFormat.setSchema(job, schema); job.setMapperClass(WriteUserMap.class); job.setNumReduceTasks(0); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(AvroParquetOutputFormat.class); FileInputFormat.setInputPaths(job, inputPath); FileOutputFormat.setOutputPath(job, outputPath); job.waitForCompletion(true); return 0; } public static void main(String[] args) throws Exception { try { int res = ToolRunner.run(new Configuration(), new WriteParquet(), args); System.exit(res); } catch (Exception e) { e.printStackTrace(); System.exit(255); } } } ================================================ FILE: examples/avro/write_avro_kv ================================================ #!/bin/bash # args: KEY_SCHEMA_FILE, VALUE_SCHEMA_FILE, CSV_IN_FILE, AVRO_OUT_FILE set -euo pipefail [ -n "${DEBUG:-}" ] && set -x this="${BASH_SOURCE-$0}"
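# Typical invocation, as done by the container run scripts (file names are
# illustrative):
#
#   ./write_avro_kv schemas/user.avsc schemas/pet.avsc users.csv users.avro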
this_dir=$(cd -P -- "$(dirname -- "${this}")" && pwd -P)
. "${this_dir}/config.sh"

pushd "${this_dir}"
[ -f "${CLASS_DIR}/it/crs4/pydoop/WriteKV.class" ] || ./build.sh
java -cp "${CLASS_DIR}:$(<"${CP_PATH}")" it.crs4.pydoop.WriteKV "$@"
popd


================================================
FILE: examples/c++/HadoopPipes.cc
================================================
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "hadoop/Pipes.hh"
#include "hadoop/SerialUtils.hh"
#include "hadoop/StringUtils.hh"

#include <map>
#include <vector>
#include <iostream>
#include <fstream>

#include <errno.h>
#include <netinet/in.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <sys/socket.h>
#include <pthread.h>
#include <unistd.h>

#include <openssl/hmac.h>
#include <openssl/buffer.h>

using std::map;
using std::string;
using std::vector;

using namespace HadoopUtils;

namespace HadoopPipes {

  class JobConfImpl: public JobConf {
  private:
    map<string, string> values;
  public:
    void set(const string& key, const string& value) {
      values[key] = value;
    }

    virtual bool hasKey(const string& key) const {
      return values.find(key) != values.end();
    }

    virtual const string& get(const string& key) const {
      map<string, string>::const_iterator itr = values.find(key);
      if (itr == values.end()) {
        throw Error("Key " + key + " not found in JobConf");
      }
      return itr->second;
    }

    virtual int getInt(const string& key) const {
      const string& val = get(key);
      return toInt(val);
    }

    virtual float getFloat(const string& key) const {
      const string& val = get(key);
      return toFloat(val);
    }

    virtual bool getBoolean(const string& key) const {
      const string& val = get(key);
      return toBool(val);
    }
  };

  class DownwardProtocol {
  public:
    virtual void start(int protocol) = 0;
    virtual void setJobConf(vector<string> values) = 0;
    virtual void setInputTypes(string keyType, string valueType) = 0;
    virtual void runMap(string inputSplit, int numReduces, bool pipedInput) = 0;
    virtual void mapItem(const string& key, const string& value) = 0;
    virtual void runReduce(int reduce, bool pipedOutput) = 0;
    virtual void reduceKey(const string& key) = 0;
    virtual void reduceValue(const string& value) = 0;
    virtual void close() = 0;
    virtual void abort() = 0;
    virtual ~DownwardProtocol() {}
  };

  class UpwardProtocol {
  public:
    virtual void output(const string& key, const string& value) = 0;
    virtual void partitionedOutput(int reduce, const string& key,
                                   const string& value) = 0;
    virtual void status(const string& message) = 0;
    virtual void progress(float progress) = 0;
    virtual void done() = 0;
    virtual void registerCounter(int id, const string& group,
                                 const string& name) = 0;
    virtual void incrementCounter(const TaskContext::Counter* counter,
                                  uint64_t amount) = 0;
    virtual ~UpwardProtocol() {}
  };

  class Protocol {
  public:
    virtual void nextEvent() = 0;
    virtual UpwardProtocol* getUplink() = 0;
    virtual ~Protocol() {}
  };

  class TextUpwardProtocol: public UpwardProtocol {
  private:
    FILE* stream;
    static const char fieldSeparator = '\t';
    static const
char lineSeparator = '\n'; void writeBuffer(const string& buffer) { fputs(quoteString(buffer, "\t\n").c_str(), stream); } public: TextUpwardProtocol(FILE* _stream): stream(_stream) {} virtual void output(const string& key, const string& value) { fprintf(stream, "output%c", fieldSeparator); writeBuffer(key); fprintf(stream, "%c", fieldSeparator); writeBuffer(value); fprintf(stream, "%c", lineSeparator); } virtual void partitionedOutput(int reduce, const string& key, const string& value) { fprintf(stream, "parititionedOutput%c%d%c", fieldSeparator, reduce, fieldSeparator); writeBuffer(key); fprintf(stream, "%c", fieldSeparator); writeBuffer(value); fprintf(stream, "%c", lineSeparator); } virtual void status(const string& message) { fprintf(stream, "status%c%s%c", fieldSeparator, message.c_str(), lineSeparator); } virtual void progress(float progress) { fprintf(stream, "progress%c%f%c", fieldSeparator, progress, lineSeparator); } virtual void registerCounter(int id, const string& group, const string& name) { fprintf(stream, "registerCounter%c%d%c%s%c%s%c", fieldSeparator, id, fieldSeparator, group.c_str(), fieldSeparator, name.c_str(), lineSeparator); } virtual void incrementCounter(const TaskContext::Counter* counter, uint64_t amount) { fprintf(stream, "incrCounter%c%d%c%ld%c", fieldSeparator, counter->getId(), fieldSeparator, (long)amount, lineSeparator); } virtual void done() { fprintf(stream, "done%c", lineSeparator); } }; class TextProtocol: public Protocol { private: FILE* downStream; DownwardProtocol* handler; UpwardProtocol* uplink; string key; string value; int readUpto(string& buffer, const char* limit) { int ch; buffer.clear(); while ((ch = getc(downStream)) != -1) { if (strchr(limit, ch) != NULL) { return ch; } buffer += ch; } return -1; } static const char* delim; public: TextProtocol(FILE* down, DownwardProtocol* _handler, FILE* up) { downStream = down; uplink = new TextUpwardProtocol(up); handler = _handler; } UpwardProtocol* getUplink() { return uplink; } virtual void nextEvent() { string command; string arg; int sep; sep = readUpto(command, delim); if (command == "mapItem") { HADOOP_ASSERT(sep == '\t', "Short text protocol command " + command); sep = readUpto(key, delim); HADOOP_ASSERT(sep == '\t', "Short text protocol command " + command); sep = readUpto(value, delim); HADOOP_ASSERT(sep == '\n', "Long text protocol command " + command); handler->mapItem(key, value); } else if (command == "reduceValue") { HADOOP_ASSERT(sep == '\t', "Short text protocol command " + command); sep = readUpto(value, delim); HADOOP_ASSERT(sep == '\n', "Long text protocol command " + command); handler->reduceValue(value); } else if (command == "reduceKey") { HADOOP_ASSERT(sep == '\t', "Short text protocol command " + command); sep = readUpto(key, delim); HADOOP_ASSERT(sep == '\n', "Long text protocol command " + command); handler->reduceKey(key); } else if (command == "start") { HADOOP_ASSERT(sep == '\t', "Short text protocol command " + command); sep = readUpto(arg, delim); HADOOP_ASSERT(sep == '\n', "Long text protocol command " + command); handler->start(toInt(arg)); } else if (command == "setJobConf") { HADOOP_ASSERT(sep == '\t', "Short text protocol command " + command); sep = readUpto(arg, delim); int len = toInt(arg); vector values(len); for(int i=0; i < len; ++i) { HADOOP_ASSERT(sep == '\t', "Short text protocol command " + command); sep = readUpto(arg, delim); values.push_back(arg); } HADOOP_ASSERT(sep == '\n', "Long text protocol command " + command); handler->setJobConf(values); } else 
if (command == "setInputTypes") { HADOOP_ASSERT(sep == '\t', "Short text protocol command " + command); sep = readUpto(key, delim); HADOOP_ASSERT(sep == '\t', "Short text protocol command " + command); sep = readUpto(value, delim); HADOOP_ASSERT(sep == '\n', "Long text protocol command " + command); handler->setInputTypes(key, value); } else if (command == "runMap") { string split; HADOOP_ASSERT(sep == '\t', "Short text protocol command " + command); sep = readUpto(split, delim); string reduces; HADOOP_ASSERT(sep == '\t', "Short text protocol command " + command); sep = readUpto(reduces, delim); HADOOP_ASSERT(sep == '\t', "Short text protocol command " + command); sep = readUpto(arg, delim); HADOOP_ASSERT(sep == '\n', "Long text protocol command " + command); handler->runMap(split, toInt(reduces), toBool(arg)); } else if (command == "runReduce") { HADOOP_ASSERT(sep == '\t', "Short text protocol command " + command); sep = readUpto(arg, delim); HADOOP_ASSERT(sep == '\t', "Short text protocol command " + command); string piped; sep = readUpto(piped, delim); HADOOP_ASSERT(sep == '\n', "Long text protocol command " + command); handler->runReduce(toInt(arg), toBool(piped)); } else if (command == "abort") { HADOOP_ASSERT(sep == '\n', "Long text protocol command " + command); handler->abort(); } else if (command == "close") { HADOOP_ASSERT(sep == '\n', "Long text protocol command " + command); handler->close(); } else { throw Error("Illegal text protocol command " + command); } } ~TextProtocol() { delete uplink; } }; const char* TextProtocol::delim = "\t\n"; enum MESSAGE_TYPE {START_MESSAGE, SET_JOB_CONF, SET_INPUT_TYPES, RUN_MAP, MAP_ITEM, RUN_REDUCE, REDUCE_KEY, REDUCE_VALUE, CLOSE, ABORT, AUTHENTICATION_REQ, OUTPUT=50, PARTITIONED_OUTPUT, STATUS, PROGRESS, DONE, REGISTER_COUNTER, INCREMENT_COUNTER, AUTHENTICATION_RESP}; class BinaryUpwardProtocol: public UpwardProtocol { private: FileOutStream* stream; public: BinaryUpwardProtocol(FILE* _stream) { stream = new FileOutStream(); HADOOP_ASSERT(stream->open(_stream), "problem opening stream"); } virtual void authenticate(const string &responseDigest) { serializeInt(AUTHENTICATION_RESP, *stream); serializeString(responseDigest, *stream); stream->flush(); } virtual void output(const string& key, const string& value) { serializeInt(OUTPUT, *stream); serializeString(key, *stream); serializeString(value, *stream); } virtual void partitionedOutput(int reduce, const string& key, const string& value) { serializeInt(PARTITIONED_OUTPUT, *stream); serializeInt(reduce, *stream); serializeString(key, *stream); serializeString(value, *stream); } virtual void status(const string& message) { serializeInt(STATUS, *stream); serializeString(message, *stream); } virtual void progress(float progress) { serializeInt(PROGRESS, *stream); serializeFloat(progress, *stream); stream->flush(); } virtual void done() { serializeInt(DONE, *stream); } virtual void registerCounter(int id, const string& group, const string& name) { serializeInt(REGISTER_COUNTER, *stream); serializeInt(id, *stream); serializeString(group, *stream); serializeString(name, *stream); } virtual void incrementCounter(const TaskContext::Counter* counter, uint64_t amount) { serializeInt(INCREMENT_COUNTER, *stream); serializeInt(counter->getId(), *stream); serializeLong(amount, *stream); } ~BinaryUpwardProtocol() { delete stream; } }; class BinaryProtocol: public Protocol { private: FileInStream* downStream; DownwardProtocol* handler; BinaryUpwardProtocol * uplink; string key; string value; string password; 
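    // password is loaded from the file named by the
    // hadoop.pipes.shared.secret.location environment variable and used in
    // the HMAC-SHA1 challenge/response handshake implemented below;
    // authDone records whether the framework has authenticated itself.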
bool authDone; void getPassword(string &password) { const char *passwordFile = getenv("hadoop.pipes.shared.secret.location"); if (passwordFile == NULL) { return; } std::ifstream fstr(passwordFile, std::fstream::binary); if (fstr.fail()) { std::cerr << "Could not open the password file" << std::endl; return; } unsigned char * passBuff = new unsigned char [512]; fstr.read((char *)passBuff, 512); int passwordLength = fstr.gcount(); fstr.close(); passBuff[passwordLength] = 0; password.replace(0, passwordLength, (const char *) passBuff, passwordLength); delete [] passBuff; return; } void verifyDigestAndRespond(string& digest, string& challenge) { if (password.empty()) { //password can be empty if process is running in debug mode from //command file. authDone = true; return; } if (!verifyDigest(password, digest, challenge)) { std::cerr << "Server failed to authenticate. Exiting" << std::endl; exit(-1); } authDone = true; string responseDigest = createDigest(password, digest); uplink->authenticate(responseDigest); } bool verifyDigest(string &password, string& digest, string& challenge) { string expectedDigest = createDigest(password, challenge); if (digest == expectedDigest) { return true; } else { return false; } } string createDigest(string &password, string& msg) { #if OPENSSL_VERSION_NUMBER < 0x10100000L HMAC_CTX ctx; unsigned char digest[EVP_MAX_MD_SIZE]; HMAC_Init(&ctx, (const unsigned char *)password.c_str(), password.length(), EVP_sha1()); HMAC_Update(&ctx, (const unsigned char *)msg.c_str(), msg.length()); unsigned int digestLen; HMAC_Final(&ctx, digest, &digestLen); HMAC_cleanup(&ctx); #else HMAC_CTX *ctx = HMAC_CTX_new(); unsigned char digest[EVP_MAX_MD_SIZE]; HMAC_Init_ex(ctx, (const unsigned char *)password.c_str(), password.length(), EVP_sha1(), NULL); HMAC_Update(ctx, (const unsigned char *)msg.c_str(), msg.length()); unsigned int digestLen; HMAC_Final(ctx, digest, &digestLen); HMAC_CTX_free(ctx); #endif //now apply base64 encoding BIO *bmem, *b64; BUF_MEM *bptr; b64 = BIO_new(BIO_f_base64()); bmem = BIO_new(BIO_s_mem()); b64 = BIO_push(b64, bmem); BIO_write(b64, digest, digestLen); BIO_flush(b64); BIO_get_mem_ptr(b64, &bptr); char digestBuffer[bptr->length]; memcpy(digestBuffer, bptr->data, bptr->length-1); digestBuffer[bptr->length-1] = 0; BIO_free_all(b64); return string(digestBuffer); } public: BinaryProtocol(FILE* down, DownwardProtocol* _handler, FILE* up) { downStream = new FileInStream(); downStream->open(down); uplink = new BinaryUpwardProtocol(up); handler = _handler; authDone = false; getPassword(password); } UpwardProtocol* getUplink() { return uplink; } virtual void nextEvent() { int32_t cmd; cmd = deserializeInt(*downStream); if (!authDone && cmd != AUTHENTICATION_REQ) { //Authentication request must be the first message if //authentication is not complete std::cerr << "Command:" << cmd << "received before authentication. " << "Exiting.." 
<< std::endl; exit(-1); } switch (cmd) { case AUTHENTICATION_REQ: { string digest; string challenge; deserializeString(digest, *downStream); deserializeString(challenge, *downStream); verifyDigestAndRespond(digest, challenge); break; } case START_MESSAGE: { int32_t prot; prot = deserializeInt(*downStream); handler->start(prot); break; } case SET_JOB_CONF: { int32_t entries; entries = deserializeInt(*downStream); vector result(entries); for(int i=0; i < entries; ++i) { string item; deserializeString(item, *downStream); result.push_back(item); } handler->setJobConf(result); break; } case SET_INPUT_TYPES: { string keyType; string valueType; deserializeString(keyType, *downStream); deserializeString(valueType, *downStream); handler->setInputTypes(keyType, valueType); break; } case RUN_MAP: { string split; int32_t numReduces; int32_t piped; deserializeString(split, *downStream); numReduces = deserializeInt(*downStream); piped = deserializeInt(*downStream); handler->runMap(split, numReduces, piped); break; } case MAP_ITEM: { deserializeString(key, *downStream); deserializeString(value, *downStream); handler->mapItem(key, value); break; } case RUN_REDUCE: { int32_t reduce; int32_t piped; reduce = deserializeInt(*downStream); piped = deserializeInt(*downStream); handler->runReduce(reduce, piped); break; } case REDUCE_KEY: { deserializeString(key, *downStream); handler->reduceKey(key); break; } case REDUCE_VALUE: { deserializeString(value, *downStream); handler->reduceValue(value); break; } case CLOSE: handler->close(); break; case ABORT: handler->abort(); break; default: HADOOP_ASSERT(false, "Unknown binary command " + toString(cmd)); } } virtual ~BinaryProtocol() { delete downStream; delete uplink; } }; /** * Define a context object to give to combiners that will let them * go through the values and emit their results correctly. 
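 * A CombineContext wraps the in-memory buffer of map outputs, iterating
 * over it one key (nextKey) and one value (nextValue) at a time, and
 * routes emitted pairs through the partitioner when one is defined.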
*/ class CombineContext: public ReduceContext { private: ReduceContext* baseContext; Partitioner* partitioner; int numReduces; UpwardProtocol* uplink; bool firstKey; bool firstValue; map >::iterator keyItr; map >::iterator endKeyItr; vector::iterator valueItr; vector::iterator endValueItr; public: CombineContext(ReduceContext* _baseContext, Partitioner* _partitioner, int _numReduces, UpwardProtocol* _uplink, map >& data) { baseContext = _baseContext; partitioner = _partitioner; numReduces = _numReduces; uplink = _uplink; keyItr = data.begin(); endKeyItr = data.end(); firstKey = true; firstValue = true; } virtual const JobConf* getJobConf() { return baseContext->getJobConf(); } virtual const std::string& getInputKey() { return keyItr->first; } virtual const std::string& getInputValue() { return *valueItr; } virtual void emit(const std::string& key, const std::string& value) { if (partitioner != NULL) { uplink->partitionedOutput(partitioner->partition(key, numReduces), key, value); } else { uplink->output(key, value); } } virtual void progress() { baseContext->progress(); } virtual void setStatus(const std::string& status) { baseContext->setStatus(status); } bool nextKey() { if (firstKey) { firstKey = false; } else { ++keyItr; } if (keyItr != endKeyItr) { valueItr = keyItr->second.begin(); endValueItr = keyItr->second.end(); firstValue = true; return true; } return false; } virtual bool nextValue() { if (firstValue) { firstValue = false; } else { ++valueItr; } return valueItr != endValueItr; } virtual Counter* getCounter(const std::string& group, const std::string& name) { return baseContext->getCounter(group, name); } virtual void incrementCounter(const Counter* counter, uint64_t amount) { baseContext->incrementCounter(counter, amount); } }; /** * A RecordWriter that will take the map outputs, buffer them up and then * combine then when the buffer is full. 
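 * The spill threshold defaults to 100 MB and can be overridden via the
 * mapreduce.task.io.sort.mb job property (see TaskContextImpl::runMap
 * below).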
*/ class CombineRunner: public RecordWriter { private: map > data; int64_t spillSize; int64_t numBytes; ReduceContext* baseContext; Partitioner* partitioner; int numReduces; UpwardProtocol* uplink; Reducer* combiner; public: CombineRunner(int64_t _spillSize, ReduceContext* _baseContext, Reducer* _combiner, UpwardProtocol* _uplink, Partitioner* _partitioner, int _numReduces) { numBytes = 0; spillSize = _spillSize; baseContext = _baseContext; partitioner = _partitioner; numReduces = _numReduces; uplink = _uplink; combiner = _combiner; } virtual void emit(const std::string& key, const std::string& value) { numBytes += key.length() + value.length(); data[key].push_back(value); if (numBytes >= spillSize) { spillAll(); } } virtual void close() { spillAll(); } private: void spillAll() { CombineContext context(baseContext, partitioner, numReduces, uplink, data); while (context.nextKey()) { combiner->reduce(context); } data.clear(); numBytes = 0; } }; class TaskContextImpl: public MapContext, public ReduceContext, public DownwardProtocol { private: bool done; JobConf* jobConf; string key; const string* newKey; const string* value; bool hasTask; bool isNewKey; bool isNewValue; string* inputKeyClass; string* inputValueClass; string status; float progressFloat; uint64_t lastProgress; bool statusSet; Protocol* protocol; UpwardProtocol *uplink; string* inputSplit; RecordReader* reader; Mapper* mapper; Reducer* reducer; RecordWriter* writer; Partitioner* partitioner; int numReduces; const Factory* factory; pthread_mutex_t mutexDone; std::vector registeredCounterIds; public: TaskContextImpl(const Factory& _factory) { statusSet = false; done = false; newKey = NULL; factory = &_factory; jobConf = NULL; inputKeyClass = NULL; inputValueClass = NULL; inputSplit = NULL; mapper = NULL; reducer = NULL; reader = NULL; writer = NULL; partitioner = NULL; protocol = NULL; isNewKey = false; isNewValue = false; lastProgress = 0; progressFloat = 0.0f; hasTask = false; pthread_mutex_init(&mutexDone, NULL); } void setProtocol(Protocol* _protocol, UpwardProtocol* _uplink) { protocol = _protocol; uplink = _uplink; } virtual void start(int protocol) { if (protocol != 0) { throw Error("Protocol version " + toString(protocol) + " not supported"); } } virtual void setJobConf(vector values) { int len = values.size(); JobConfImpl* result = new JobConfImpl(); HADOOP_ASSERT(len % 2 == 0, "Odd length of job conf values"); for(int i=0; i < len; i += 2) { result->set(values[i], values[i+1]); } jobConf = result; } virtual void setInputTypes(string keyType, string valueType) { inputKeyClass = new string(keyType); inputValueClass = new string(valueType); } virtual void runMap(string _inputSplit, int _numReduces, bool pipedInput) { inputSplit = new string(_inputSplit); reader = factory->createRecordReader(*this); HADOOP_ASSERT((reader == NULL) == pipedInput, pipedInput ? 
"RecordReader defined when not needed.": "RecordReader not defined"); if (reader != NULL) { value = new string(); } mapper = factory->createMapper(*this); numReduces = _numReduces; if (numReduces != 0) { reducer = factory->createCombiner(*this); partitioner = factory->createPartitioner(*this); } if (reducer != NULL) { int64_t spillSize = 100; if (jobConf->hasKey("mapreduce.task.io.sort.mb")) { spillSize = jobConf->getInt("mapreduce.task.io.sort.mb"); } writer = new CombineRunner(spillSize * 1024 * 1024, this, reducer, uplink, partitioner, numReduces); } hasTask = true; } virtual void mapItem(const string& _key, const string& _value) { newKey = &_key; value = &_value; isNewKey = true; } virtual void runReduce(int reduce, bool pipedOutput) { reducer = factory->createReducer(*this); writer = factory->createRecordWriter(*this); HADOOP_ASSERT((writer == NULL) == pipedOutput, pipedOutput ? "RecordWriter defined when not needed.": "RecordWriter not defined"); hasTask = true; } virtual void reduceKey(const string& _key) { isNewKey = true; newKey = &_key; } virtual void reduceValue(const string& _value) { isNewValue = true; value = &_value; } virtual bool isDone() { pthread_mutex_lock(&mutexDone); bool doneCopy = done; pthread_mutex_unlock(&mutexDone); return doneCopy; } virtual void close() { pthread_mutex_lock(&mutexDone); done = true; pthread_mutex_unlock(&mutexDone); } virtual void abort() { throw Error("Aborted by driver"); } void waitForTask() { while (!done && !hasTask) { protocol->nextEvent(); } } bool nextKey() { if (reader == NULL) { while (!isNewKey) { nextValue(); if (done) { return false; } } key = *newKey; } else { if (!reader->next(key, const_cast(*value))) { pthread_mutex_lock(&mutexDone); done = true; pthread_mutex_unlock(&mutexDone); return false; } progressFloat = reader->getProgress(); } isNewKey = false; if (mapper != NULL) { mapper->map(*this); } else { reducer->reduce(*this); } return true; } /** * Advance to the next value. */ virtual bool nextValue() { if (isNewKey || done) { return false; } isNewValue = false; progress(); protocol->nextEvent(); return isNewValue; } /** * Get the JobConf for the current task. */ virtual JobConf* getJobConf() { return jobConf; } /** * Get the current key. * @return the current key or NULL if called before the first map or reduce */ virtual const string& getInputKey() { return key; } /** * Get the current value. * @return the current value or NULL if called before the first map or * reduce */ virtual const string& getInputValue() { return *value; } /** * Mark your task as having made progress without changing the status * message. */ virtual void progress() { if (uplink != 0) { uint64_t now = getCurrentMillis(); if (now - lastProgress > 1000) { lastProgress = now; if (statusSet) { uplink->status(status); statusSet = false; } uplink->progress(progressFloat); } } } /** * Set the status message and call progress. */ virtual void setStatus(const string& status) { this->status = status; statusSet = true; progress(); } /** * Get the name of the key class of the input to this task. */ virtual const string& getInputKeyClass() { return *inputKeyClass; } /** * Get the name of the value class of the input to this task. */ virtual const string& getInputValueClass() { return *inputValueClass; } /** * Access the InputSplit of the mapper. 
*/ virtual const std::string& getInputSplit() { return *inputSplit; } virtual void emit(const string& key, const string& value) { progress(); if (writer != NULL) { writer->emit(key, value); } else if (partitioner != NULL) { int part = partitioner->partition(key, numReduces); uplink->partitionedOutput(part, key, value); } else { uplink->output(key, value); } } /** * Register a counter with the given group and name. */ virtual Counter* getCounter(const std::string& group, const std::string& name) { int id = registeredCounterIds.size(); registeredCounterIds.push_back(id); uplink->registerCounter(id, group, name); return new Counter(id); } /** * Increment the value of the counter with the given amount. */ virtual void incrementCounter(const Counter* counter, uint64_t amount) { uplink->incrementCounter(counter, amount); } void closeAll() { if (reader) { reader->close(); } if (mapper) { mapper->close(); } if (reducer) { reducer->close(); } if (writer) { writer->close(); } } virtual ~TaskContextImpl() { delete jobConf; delete inputKeyClass; delete inputValueClass; delete inputSplit; if (reader) { delete value; } delete reader; delete mapper; delete reducer; delete writer; delete partitioner; pthread_mutex_destroy(&mutexDone); } }; /** * Ping the parent every 5 seconds to know if it is alive */ void* ping(void* ptr) { TaskContextImpl* context = (TaskContextImpl*) ptr; char* portStr = getenv("mapreduce.pipes.command.port"); int MAX_RETRIES = 3; int remaining_retries = MAX_RETRIES; while (!context->isDone()) { try{ sleep(5); int sock = -1; if (portStr) { sock = socket(PF_INET, SOCK_STREAM, 0); HADOOP_ASSERT(sock != - 1, string("problem creating socket: ") + strerror(errno)); sockaddr_in addr; addr.sin_family = AF_INET; addr.sin_port = htons(toInt(portStr)); addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); HADOOP_ASSERT(connect(sock, (sockaddr*) &addr, sizeof(addr)) == 0, string("problem connecting command socket: ") + strerror(errno)); } if (sock != -1) { int result = shutdown(sock, SHUT_RDWR); HADOOP_ASSERT(result == 0, "problem shutting socket"); result = close(sock); HADOOP_ASSERT(result == 0, "problem closing socket"); } remaining_retries = MAX_RETRIES; } catch (Error& err) { if (!context->isDone()) { fprintf(stderr, "Hadoop Pipes Exception: in ping %s\n", err.getMessage().c_str()); remaining_retries -= 1; if (remaining_retries == 0) { exit(1); } } else { return NULL; } } } return NULL; } /** * Run the assigned task in the framework. * The user's main function should set the various functions using the * set* functions above and then call this. * @return true, if the task succeeded. 
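 * Depending on the environment, the task communicates with the framework
 * over a local socket (mapreduce.pipes.command.port), through command
 * files (mapreduce.pipes.commandfile) or, for debugging, via the text
 * protocol on stdin/stdout.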
*/ bool runTask(const Factory& factory) { try { TaskContextImpl* context = new TaskContextImpl(factory); Protocol* connection; char* portStr = getenv("mapreduce.pipes.command.port"); int sock = -1; FILE* stream = NULL; FILE* outStream = NULL; char *bufin = NULL; char *bufout = NULL; if (portStr) { sock = socket(PF_INET, SOCK_STREAM, 0); HADOOP_ASSERT(sock != - 1, string("problem creating socket: ") + strerror(errno)); sockaddr_in addr; addr.sin_family = AF_INET; addr.sin_port = htons(toInt(portStr)); addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); HADOOP_ASSERT(connect(sock, (sockaddr*) &addr, sizeof(addr)) == 0, string("problem connecting command socket: ") + strerror(errno)); stream = fdopen(sock, "r"); outStream = fdopen(sock, "w"); // increase buffer size int bufsize = 128*1024; int setbuf; bufin = new char[bufsize]; bufout = new char[bufsize]; setbuf = setvbuf(stream, bufin, _IOFBF, bufsize); HADOOP_ASSERT(setbuf == 0, string("problem with setvbuf for inStream: ") + strerror(errno)); setbuf = setvbuf(outStream, bufout, _IOFBF, bufsize); HADOOP_ASSERT(setbuf == 0, string("problem with setvbuf for outStream: ") + strerror(errno)); connection = new BinaryProtocol(stream, context, outStream); } else if (getenv("mapreduce.pipes.commandfile")) { char* filename = getenv("mapreduce.pipes.commandfile"); string outFilename = filename; outFilename += ".out"; stream = fopen(filename, "r"); outStream = fopen(outFilename.c_str(), "w"); connection = new BinaryProtocol(stream, context, outStream); } else { connection = new TextProtocol(stdin, context, stdout); } context->setProtocol(connection, connection->getUplink()); pthread_t pingThread; pthread_create(&pingThread, NULL, ping, (void*)(context)); context->waitForTask(); while (!context->isDone()) { context->nextKey(); } context->closeAll(); connection->getUplink()->done(); pthread_join(pingThread,NULL); delete context; delete connection; if (stream != NULL) { fflush(stream); } if (outStream != NULL) { fflush(outStream); } fflush(stdout); if (sock != -1) { int result = shutdown(sock, SHUT_RDWR); HADOOP_ASSERT(result == 0, "problem shutting socket"); result = close(sock); HADOOP_ASSERT(result == 0, "problem closing socket"); } if (stream != NULL) { //fclose(stream); } if (outStream != NULL) { //fclose(outStream); } delete[] bufin; delete[] bufout; return true; } catch (Error& err) { fprintf(stderr, "Hadoop Pipes Exception: %s\n", err.getMessage().c_str()); return false; } } } ================================================ FILE: examples/c++/Makefile ================================================ # yum install openssl-devel CXXFLAGS := -pthread -g -pipe -Iinclude LDFLAGS := -pthread LDLIBS := -lcrypto all: wordcount wordcount: wordcount.o StringUtils.o SerialUtils.o HadoopPipes.o $(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS) ================================================ FILE: examples/c++/README.txt ================================================ C++ word count implementation, mostly for comparison purposes. Not run together with other examples and/or tests by default. Includes the C++ pipes source so that we can just build and link everything together into the executable task implementation. Requirements: openssl dev version (e.g., yum install openssl-devel). NOTE: the map function splits input values on space chars, unlike the Java and Python versions, which split on multiple whitespace chars. This can lead to a slightly different output, depending on the input text. 
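For instance (an illustrative Python 3 snippet, not part of the example
code), splitting on every single space turns runs of spaces into empty
"words", while whitespace splitting does not:

  >>> "a  b".split(' ')   # roughly what the C++ mapper does
  ['a', '', 'b']
  >>> "a  b".split()      # what the Python/Java versions do
  ['a', 'b']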
================================================ FILE: examples/c++/SerialUtils.cc ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "hadoop/SerialUtils.hh" #include "hadoop/StringUtils.hh" #include #include #include #include #include using std::string; namespace HadoopUtils { Error::Error(const std::string& msg): error(msg) { } Error::Error(const std::string& msg, const std::string& file, int line, const std::string& function) { error = msg + " at " + file + ":" + toString(line) + " in " + function; } const std::string& Error::getMessage() const { return error; } FileInStream::FileInStream() { mFile = NULL; isOwned = false; } bool FileInStream::open(const std::string& name) { mFile = fopen(name.c_str(), "rb"); isOwned = true; return (mFile != NULL); } bool FileInStream::open(FILE* file) { mFile = file; isOwned = false; return (mFile != NULL); } void FileInStream::read(void *buf, size_t len) { size_t result = fread(buf, len, 1, mFile); if (result == 0) { if (feof(mFile)) { HADOOP_ASSERT(false, "end of file"); } else { HADOOP_ASSERT(false, string("read error on file: ") + strerror(errno)); } } } bool FileInStream::skip(size_t nbytes) { return (0==fseek(mFile, nbytes, SEEK_CUR)); } bool FileInStream::close() { int ret = 0; if (mFile != NULL && isOwned) { ret = fclose(mFile); } mFile = NULL; return (ret==0); } FileInStream::~FileInStream() { if (mFile != NULL) { close(); } } FileOutStream::FileOutStream() { mFile = NULL; isOwned = false; } bool FileOutStream::open(const std::string& name, bool overwrite) { if (!overwrite) { mFile = fopen(name.c_str(), "rb"); if (mFile != NULL) { fclose(mFile); return false; } } mFile = fopen(name.c_str(), "wb"); isOwned = true; return (mFile != NULL); } bool FileOutStream::open(FILE* file) { mFile = file; isOwned = false; return (mFile != NULL); } void FileOutStream::write(const void* buf, size_t len) { size_t result = fwrite(buf, len, 1, mFile); HADOOP_ASSERT(result == 1, string("write error to file: ") + strerror(errno)); } bool FileOutStream::advance(size_t nbytes) { return (0==fseek(mFile, nbytes, SEEK_CUR)); } bool FileOutStream::close() { int ret = 0; if (mFile != NULL && isOwned) { ret = fclose(mFile); } mFile = NULL; return (ret == 0); } void FileOutStream::flush() { fflush(mFile); } FileOutStream::~FileOutStream() { if (mFile != NULL) { close(); } } StringInStream::StringInStream(const std::string& str): buffer(str) { itr = buffer.begin(); } void StringInStream::read(void *buf, size_t buflen) { size_t bytes = 0; char* output = (char*) buf; std::string::const_iterator end = buffer.end(); while (bytes < buflen) { output[bytes++] = *itr; ++itr; if (itr == end) { break; } } HADOOP_ASSERT(bytes == buflen, "unexpected end of string reached"); } void serializeInt(int32_t t, 
OutStream& stream) { serializeLong(t,stream); } void serializeLong(int64_t t, OutStream& stream) { if (t >= -112 && t <= 127) { int8_t b = t; stream.write(&b, 1); return; } int8_t len = -112; if (t < 0) { t ^= -1ll; // reset the sign bit len = -120; } uint64_t tmp = t; while (tmp != 0) { tmp = tmp >> 8; len--; } stream.write(&len, 1); len = (len < -120) ? -(len + 120) : -(len + 112); for (uint32_t idx = len; idx != 0; idx--) { uint32_t shiftbits = (idx - 1) * 8; uint64_t mask = 0xFFll << shiftbits; uint8_t b = (t & mask) >> shiftbits; stream.write(&b, 1); } } int32_t deserializeInt(InStream& stream) { return deserializeLong(stream); } int64_t deserializeLong(InStream& stream) { int8_t b; stream.read(&b, 1); if (b >= -112) { return b; } bool negative; int len; if (b < -120) { negative = true; len = -120 - b; } else { negative = false; len = -112 - b; } uint8_t barr[len]; stream.read(barr, len); int64_t t = 0; for (int idx = 0; idx < len; idx++) { t = t << 8; t |= (barr[idx] & 0xFF); } if (negative) { t ^= -1ll; } return t; } void serializeFloat(float t, OutStream& stream) { char buf[sizeof(float)]; XDR xdrs; xdrmem_create(&xdrs, buf, sizeof(float), XDR_ENCODE); xdr_float(&xdrs, &t); stream.write(buf, sizeof(float)); } float deserializeFloat(InStream& stream) { float f; deserializeFloat(f, stream); return f; } void deserializeFloat(float& t, InStream& stream) { char buf[sizeof(float)]; stream.read(buf, sizeof(float)); XDR xdrs; xdrmem_create(&xdrs, buf, sizeof(float), XDR_DECODE); xdr_float(&xdrs, &t); } void serializeString(const std::string& t, OutStream& stream) { serializeInt(t.length(), stream); if (t.length() > 0) { stream.write(t.data(), t.length()); } } void deserializeString(std::string& t, InStream& stream) { int32_t len = deserializeInt(stream); if (len > 0) { // resize the string to the right length t.resize(len); // read into the string in 64k chunks const int bufSize = 65536; int offset = 0; char buf[bufSize]; while (len > 0) { int chunkLength = len > bufSize ? bufSize : len; stream.read(buf, chunkLength); t.replace(offset, chunkLength, buf, chunkLength); offset += chunkLength; len -= chunkLength; } } else { t.clear(); } } } ================================================ FILE: examples/c++/StringUtils.cc ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #include "hadoop/StringUtils.hh" #include "hadoop/SerialUtils.hh" #include #include #include #include #include #include #include using std::string; using std::vector; namespace HadoopUtils { string toString(int32_t x) { char str[100]; sprintf(str, "%d", x); return str; } int toInt(const string& val) { int result; char trash; int num = sscanf(val.c_str(), "%d%c", &result, &trash); HADOOP_ASSERT(num == 1, "Problem converting " + val + " to integer."); return result; } float toFloat(const string& val) { float result; char trash; int num = sscanf(val.c_str(), "%f%c", &result, &trash); HADOOP_ASSERT(num == 1, "Problem converting " + val + " to float."); return result; } bool toBool(const string& val) { if (val == "true") { return true; } else if (val == "false") { return false; } else { HADOOP_ASSERT(false, "Problem converting " + val + " to boolean."); } } /** * Get the current time in the number of milliseconds since 1970. */ uint64_t getCurrentMillis() { struct timeval tv; struct timezone tz; int sys = gettimeofday(&tv, &tz); HADOOP_ASSERT(sys != -1, strerror(errno)); return tv.tv_sec * 1000 + tv.tv_usec / 1000; } vector splitString(const std::string& str, const char* separator) { vector result; string::size_type prev_pos=0; string::size_type pos=0; while ((pos = str.find_first_of(separator, prev_pos)) != string::npos) { if (prev_pos < pos) { result.push_back(str.substr(prev_pos, pos-prev_pos)); } prev_pos = pos + 1; } if (prev_pos < str.size()) { result.push_back(str.substr(prev_pos)); } return result; } string quoteString(const string& str, const char* deliminators) { string result(str); for(int i=result.length() -1; i >= 0; --i) { char ch = result[i]; if (!isprint(ch) || ch == '\\' || strchr(deliminators, ch)) { switch (ch) { case '\\': result.replace(i, 1, "\\\\"); break; case '\t': result.replace(i, 1, "\\t"); break; case '\n': result.replace(i, 1, "\\n"); break; case ' ': result.replace(i, 1, "\\s"); break; default: char buff[4]; sprintf(buff, "\\%02x", static_cast(result[i])); result.replace(i, 1, buff); } } } return result; } string unquoteString(const string& str) { string result(str); string::size_type current = result.find('\\'); while (current != string::npos) { if (current + 1 < result.size()) { char new_ch; int num_chars; if (isxdigit(result[current+1])) { num_chars = 2; HADOOP_ASSERT(current + num_chars < result.size(), "escape pattern \\ is missing second digit in '" + str + "'"); char sub_str[3]; sub_str[0] = result[current+1]; sub_str[1] = result[current+2]; sub_str[2] = '\0'; char* end_ptr = NULL; long int int_val = strtol(sub_str, &end_ptr, 16); HADOOP_ASSERT(*end_ptr == '\0' && int_val >= 0, "escape pattern \\ is broken in '" + str + "'"); new_ch = static_cast(int_val); } else { num_chars = 1; switch(result[current+1]) { case '\\': new_ch = '\\'; break; case 't': new_ch = '\t'; break; case 'n': new_ch = '\n'; break; case 's': new_ch = ' '; break; default: string msg("unknow n escape character '"); msg += result[current+1]; HADOOP_ASSERT(false, msg + "' found in '" + str + "'"); } } result.replace(current, 1 + num_chars, 1, new_ch); current = result.find('\\', current+1); } else { HADOOP_ASSERT(false, "trailing \\ in '" + str + "'"); } } return result; } } ================================================ FILE: examples/c++/include/hadoop/Pipes.hh ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef HADOOP_PIPES_HH #define HADOOP_PIPES_HH #ifdef SWIG %module (directors="1") HadoopPipes %include "std_string.i" %feature("director") Mapper; %feature("director") Reducer; %feature("director") Partitioner; %feature("director") RecordReader; %feature("director") RecordWriter; %feature("director") Factory; #else #include #endif #include namespace HadoopPipes { /** * This interface defines the interface between application code and the * foreign code interface to Hadoop Map/Reduce. */ /** * A JobConf defines the properties for a job. */ class JobConf { public: virtual bool hasKey(const std::string& key) const = 0; virtual const std::string& get(const std::string& key) const = 0; virtual int getInt(const std::string& key) const = 0; virtual float getFloat(const std::string& key) const = 0; virtual bool getBoolean(const std::string&key) const = 0; virtual ~JobConf() {} }; /** * Task context provides the information about the task and job. */ class TaskContext { public: /** * Counter to keep track of a property and its value. */ class Counter { private: int id; public: Counter(int counterId) : id(counterId) {} Counter(const Counter& counter) : id(counter.id) {} int getId() const { return id; } }; /** * Get the JobConf for the current task. */ virtual const JobConf* getJobConf() = 0; /** * Get the current key. * @return the current key */ virtual const std::string& getInputKey() = 0; /** * Get the current value. * @return the current value */ virtual const std::string& getInputValue() = 0; /** * Generate an output record */ virtual void emit(const std::string& key, const std::string& value) = 0; /** * Mark your task as having made progress without changing the status * message. */ virtual void progress() = 0; /** * Set the status message and call progress. */ virtual void setStatus(const std::string& status) = 0; /** * Register a counter with the given group and name. */ virtual Counter* getCounter(const std::string& group, const std::string& name) = 0; /** * Increment the value of the counter with the given amount. */ virtual void incrementCounter(const Counter* counter, uint64_t amount) = 0; virtual ~TaskContext() {} }; class MapContext: public TaskContext { public: /** * Access the InputSplit of the mapper. */ virtual const std::string& getInputSplit() = 0; /** * Get the name of the key class of the input to this task. */ virtual const std::string& getInputKeyClass() = 0; /** * Get the name of the value class of the input to this task. */ virtual const std::string& getInputValueClass() = 0; }; class ReduceContext: public TaskContext { public: /** * Advance to the next value. */ virtual bool nextValue() = 0; }; class Closable { public: virtual void close() {} virtual ~Closable() {} }; /** * The application's mapper class to do map. 
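 * map() is invoked once per input record; results are emitted through
 * the MapContext passed to it.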
*/ class Mapper: public Closable { public: virtual void map(MapContext& context) = 0; }; /** * The application's reducer class to do reduce. */ class Reducer: public Closable { public: virtual void reduce(ReduceContext& context) = 0; }; /** * User code to decide where each key should be sent. */ class Partitioner { public: virtual int partition(const std::string& key, int numOfReduces) = 0; virtual ~Partitioner() {} }; /** * For applications that want to read the input directly for the map function * they can define RecordReaders in C++. */ class RecordReader: public Closable { public: virtual bool next(std::string& key, std::string& value) = 0; /** * The progress of the record reader through the split as a value between * 0.0 and 1.0. */ virtual float getProgress() = 0; }; /** * An object to write key/value pairs as they are emited from the reduce. */ class RecordWriter: public Closable { public: virtual void emit(const std::string& key, const std::string& value) = 0; }; /** * A factory to create the necessary application objects. */ class Factory { public: virtual Mapper* createMapper(MapContext& context) const = 0; virtual Reducer* createReducer(ReduceContext& context) const = 0; /** * Create a combiner, if this application has one. * @return the new combiner or NULL, if one is not needed */ virtual Reducer* createCombiner(MapContext& context) const { return NULL; } /** * Create an application partitioner object. * @return the new partitioner or NULL, if the default partitioner should be * used. */ virtual Partitioner* createPartitioner(MapContext& context) const { return NULL; } /** * Create an application record reader. * @return the new RecordReader or NULL, if the Java RecordReader should be * used. */ virtual RecordReader* createRecordReader(MapContext& context) const { return NULL; } /** * Create an application record writer. * @return the new RecordWriter or NULL, if the Java RecordWriter should be * used. */ virtual RecordWriter* createRecordWriter(ReduceContext& context) const { return NULL; } virtual ~Factory() {} }; /** * Run the assigned task in the framework. * The user's main function should set the various functions using the * set* functions above and then call this. * @return true, if the task succeeded. */ bool runTask(const Factory& factory); } #endif ================================================ FILE: examples/c++/include/hadoop/SerialUtils.hh ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef HADOOP_SERIAL_UTILS_HH #define HADOOP_SERIAL_UTILS_HH #include #include namespace HadoopUtils { /** * A simple exception class that records a message for the user. */ class Error { private: std::string error; public: /** * Create an error object with the given message. 
*/ Error(const std::string& msg); /** * Construct an error object with the given message that was created on * the given file, line, and functino. */ Error(const std::string& msg, const std::string& file, int line, const std::string& function); /** * Get the error message. */ const std::string& getMessage() const; }; /** * Check to make sure that the condition is true, and throw an exception * if it is not. The exception will contain the message and a description * of the source location. */ #define HADOOP_ASSERT(CONDITION, MESSAGE) \ { \ if (!(CONDITION)) { \ throw HadoopUtils::Error((MESSAGE), __FILE__, __LINE__, \ __func__); \ } \ } /** * An interface for an input stream. */ class InStream { public: /** * Reads len bytes from the stream into the buffer. * @param buf the buffer to read into * @param buflen the length of the buffer * @throws Error if there are problems reading */ virtual void read(void *buf, size_t len) = 0; virtual ~InStream() {} }; /** * An interface for an output stream. */ class OutStream { public: /** * Write the given buffer to the stream. * @param buf the data to write * @param len the number of bytes to write * @throws Error if there are problems writing */ virtual void write(const void *buf, size_t len) = 0; /** * Flush the data to the underlying store. */ virtual void flush() = 0; virtual ~OutStream() {} }; /** * A class to read a file as a stream. */ class FileInStream : public InStream { public: FileInStream(); bool open(const std::string& name); bool open(FILE* file); void read(void *buf, size_t buflen); bool skip(size_t nbytes); bool close(); virtual ~FileInStream(); private: /** * The file to write to. */ FILE *mFile; /** * Does is this class responsible for closing the FILE*? */ bool isOwned; }; /** * A class to write a stream to a file. */ class FileOutStream: public OutStream { public: /** * Create a stream that isn't bound to anything. */ FileOutStream(); /** * Create the given file, potentially overwriting an existing file. */ bool open(const std::string& name, bool overwrite); bool open(FILE* file); void write(const void* buf, size_t len); bool advance(size_t nbytes); void flush(); bool close(); virtual ~FileOutStream(); private: FILE *mFile; bool isOwned; }; /** * A stream that reads from a string. */ class StringInStream: public InStream { public: StringInStream(const std::string& str); virtual void read(void *buf, size_t buflen); private: const std::string& buffer; std::string::const_iterator itr; }; void serializeInt(int32_t t, OutStream& stream); int32_t deserializeInt(InStream& stream); void serializeLong(int64_t t, OutStream& stream); int64_t deserializeLong(InStream& stream); void serializeFloat(float t, OutStream& stream); void deserializeFloat(float& t, InStream& stream); float deserializeFloat(InStream& stream); void serializeString(const std::string& t, OutStream& stream); void deserializeString(std::string& t, InStream& stream); } #endif ================================================ FILE: examples/c++/include/hadoop/StringUtils.hh ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef HADOOP_STRING_UTILS_HH #define HADOOP_STRING_UTILS_HH #include #include #include namespace HadoopUtils { /** * Convert an integer to a string. */ std::string toString(int32_t x); /** * Convert a string to an integer. * @throws Error if the string is not a valid integer */ int32_t toInt(const std::string& val); /** * Convert the string to a float. * @throws Error if the string is not a valid float */ float toFloat(const std::string& val); /** * Convert the string to a boolean. * @throws Error if the string is not a valid boolean value */ bool toBool(const std::string& val); /** * Get the current time in the number of milliseconds since 1970. */ uint64_t getCurrentMillis(); /** * Split a string into "words". Multiple deliminators are treated as a single * word break, so no zero-length words are returned. * @param str the string to split * @param separator a list of characters that divide words */ std::vector splitString(const std::string& str, const char* separator); /** * Quote a string to avoid "\", non-printable characters, and the * deliminators. * @param str the string to quote * @param deliminators the set of characters to always quote */ std::string quoteString(const std::string& str, const char* deliminators); /** * Unquote the given string to return the original string. * @param str the string to unquote */ std::string unquoteString(const std::string& str); } #endif ================================================ FILE: examples/c++/include/hadoop/TemplateFactory.hh ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
 */
#ifndef HADOOP_PIPES_TEMPLATE_FACTORY_HH
#define HADOOP_PIPES_TEMPLATE_FACTORY_HH

namespace HadoopPipes {

  template <class mapper, class reducer>
  class TemplateFactory2: public Factory {
  public:
    Mapper* createMapper(MapContext& context) const {
      return new mapper(context);
    }
    Reducer* createReducer(ReduceContext& context) const {
      return new reducer(context);
    }
  };

  template <class mapper, class reducer, class partitioner>
  class TemplateFactory3: public TemplateFactory2<mapper, reducer> {
  public:
    Partitioner* createPartitioner(MapContext& context) const {
      return new partitioner(context);
    }
  };

  template <class mapper, class reducer>
  class TemplateFactory3<mapper, reducer, void>
      : public TemplateFactory2<mapper, reducer> {
  };

  template <class mapper, class reducer, class partitioner, class combiner>
  class TemplateFactory4
      : public TemplateFactory3<mapper, reducer, partitioner> {
  public:
    Reducer* createCombiner(MapContext& context) const {
      return new combiner(context);
    }
  };

  template <class mapper, class reducer, class partitioner>
  class TemplateFactory4<mapper, reducer, partitioner, void>
      : public TemplateFactory3<mapper, reducer, partitioner> {
  };

  template <class mapper, class reducer, class partitioner,
            class combiner, class recordReader>
  class TemplateFactory5
      : public TemplateFactory4<mapper, reducer, partitioner, combiner> {
  public:
    RecordReader* createRecordReader(MapContext& context) const {
      return new recordReader(context);
    }
  };

  template <class mapper, class reducer, class partitioner, class combiner>
  class TemplateFactory5<mapper, reducer, partitioner, combiner, void>
      : public TemplateFactory4<mapper, reducer, partitioner, combiner> {
  };

  template <class mapper, class reducer, class partitioner=void,
            class combiner=void, class recordReader=void,
            class recordWriter=void>
  class TemplateFactory
      : public TemplateFactory5<mapper, reducer, partitioner, combiner,
                                recordReader> {
  public:
    RecordWriter* createRecordWriter(ReduceContext& context) const {
      return new recordWriter(context);
    }
  };

  template <class mapper, class reducer, class partitioner,
            class combiner, class recordReader>
  class TemplateFactory<mapper, reducer, partitioner, combiner, recordReader,
                        void>
      : public TemplateFactory5<mapper, reducer, partitioner, combiner,
                                recordReader> {
  };

}

#endif

================================================
FILE: examples/c++/wordcount.cc
================================================
// BEGIN_COPYRIGHT
//
// Copyright 2009-2026 CRS4.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
//
// END_COPYRIGHT

#include <string>
#include <sstream>
#include <iostream>
#include <stdexcept>

#include "hadoop/Pipes.hh"
#include "hadoop/TemplateFactory.hh"
#include "hadoop/StringUtils.hh"

#define INT64_SIZE sizeof(int64_t)

int64_t deserializeLongWritable(std::string s) {
  int64_t rval = 0;
  if (s.size() < INT64_SIZE) {
    throw std::invalid_argument("not enough bytes");
  }
  for (std::size_t i = 0; i < INT64_SIZE; ++i) {
    // accumulate big-endian bytes, shifting in 8 bits at a time
    rval = (rval << 8) | static_cast<unsigned char>(s[i]);
  }
  return rval;
}

class Mapper: public HadoopPipes::Mapper {
public:
  Mapper(HadoopPipes::TaskContext &context) {
  }

  void map(HadoopPipes::MapContext &context) {
    int64_t key = deserializeLongWritable(context.getInputKey());
    std::cerr << "key (ignored): " << key << "\n";
    std::stringstream ss(context.getInputValue());
    std::string item;
    while (std::getline(ss, item, ' ')) {
      context.emit(item, "1");
    }
  }
};

class Reducer: public HadoopPipes::Reducer {
public:
  Reducer(HadoopPipes::TaskContext &context) {
  }

  void reduce(HadoopPipes::ReduceContext &context) {
    int sum = 0;
    while (context.nextValue()) {
      sum += HadoopUtils::toInt(context.getInputValue());
    }
    context.emit(context.getInputKey(), HadoopUtils::toString(sum));
  }
};

int main(int argc, char *argv[]) {
  return HadoopPipes::runTask(HadoopPipes::TemplateFactory<Mapper, Reducer>());
}

================================================
FILE: examples/config.sh
================================================
[ -n "${PYDOOP_EXAMPLES:-}" ] && return || readonly PYDOOP_EXAMPLES=1

die() {
    echo "$1" 1>&2
    exit 1
}

export USER="${USER:-$(whoami)}"
export HADOOP="${HADOOP:-hadoop}"
export HDFS="${HDFS:-hdfs}"
export MAPRED="${MAPRED:-mapred}"
export YARN="${YARN:-yarn}"
export PYTHON="${PYTHON:-python}"
export PY_VER=$("${PYTHON}" -c 'import sys; print(sys.version_info[0])')
export PYDOOP="pydoop${PY_VER}"

ensure_dfs_home() {
    ${HDFS} dfs -mkdir -p /user/${USER}
}

hadoop_fs() {
    ${HDFS} getconf -confKey fs.defaultFS | cut -d : -f 1
}

export -f die ensure_dfs_home hadoop_fs

================================================
FILE: examples/hdfs/common.py
================================================
# BEGIN_COPYRIGHT
#
# Copyright 2009-2026 CRS4.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
# of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
================================================
FILE: examples/config.sh
================================================
[ -n "${PYDOOP_EXAMPLES:-}" ] && return || readonly PYDOOP_EXAMPLES=1

die() {
    echo $1 1>&2
    exit 1
}

export USER="${USER:-$(whoami)}"
export HADOOP="${HADOOP:-hadoop}"
export HDFS="${HDFS:-hdfs}"
export MAPRED="${MAPRED:-mapred}"
export YARN="${YARN:-yarn}"
export PYTHON="${PYTHON:-python}"
export PY_VER=$("${PYTHON}" -c 'import sys; print(sys.version_info[0])')
export PYDOOP="pydoop${PY_VER}"

ensure_dfs_home() {
    ${HDFS} dfs -mkdir -p /user/${USER}
}

hadoop_fs() {
    ${HDFS} getconf -confKey fs.defaultFS | cut -d : -f 1
}

export -f die ensure_dfs_home hadoop_fs


================================================
FILE: examples/hdfs/common.py
================================================
# BEGIN_COPYRIGHT
#
# Copyright 2009-2026 CRS4.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
# of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

import os

MB = 2**20
TEST_ROOT = os.getenv("TEST_ROOT", "pydoop_test_tree")


def isdir(fs, d):
    try:
        info = fs.get_path_info(d)
    except IOError:
        return False
    return info['kind'] == 'directory'
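A minimal usage sketch for the isdir helper above. It assumes a reachable HDFS instance; the "some_dir" path is hypothetical, for illustration only:

    import pydoop.hdfs as hdfs
    from common import isdir

    with hdfs.hdfs() as fs:
        fs.create_directory("some_dir")       # hypothetical test path
        assert isdir(fs, "some_dir")          # kind == 'directory'
        assert not isdir(fs, "no_such_path")  # get_path_info raises IOError
        fs.delete("some_dir")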
================================================
FILE: examples/hdfs/repl_session.py
================================================
"""\
# DOCS_INCLUDE_START
>>> import pydoop.hdfs as hdfs
>>> hdfs.mkdir('test')
>>> hdfs.dump('hello, world', 'test/hello.txt')
>>> hdfs.load('test/hello.txt')
b'hello, world'
>>> hdfs.load('test/hello.txt', mode='rt')
'hello, world'
>>> [hdfs.path.basename(_) for _ in hdfs.ls('test')]
['hello.txt']
>>> hdfs.stat('test/hello.txt').st_size
12
>>> hdfs.path.isdir('test')
True
>>> hdfs.path.isfile('test')
False
>>> hdfs.path.basename('test/hello.txt')
'hello.txt'
>>> hdfs.cp('test', 'test.copy')
>>> [hdfs.path.basename(_) for _ in hdfs.ls('test.copy')]
['hello.txt']
>>> hdfs.get('test/hello.txt', '/tmp/hello.txt')
>>> with open('/tmp/hello.txt') as f:
...     f.read()
...
'hello, world'
>>> hdfs.put('/tmp/hello.txt', 'test.copy/hello.txt.copy')
>>> for x in sorted(hdfs.ls('test.copy')): print(repr(hdfs.path.basename(x)))
...
'hello.txt'
'hello.txt.copy'
>>> with hdfs.open('test/hello.txt', 'r') as fi:
...     fi.read(3)
...
b'hel'
>>> with hdfs.open('test/hello.txt', 'rt') as fi:
...     fi.read(3)
...
'hel'
# DOCS_INCLUDE_END
"""


def clean():
    for path in "test", "test.copy", "file:/tmp/hello.txt":
        try:
            hdfs.rm(path)
        except OSError:
            pass


if __name__ == "__main__":
    import doctest
    import pydoop.hdfs as hdfs
    clean()
    doctest.testmod(verbose=True)
    clean()


================================================
FILE: examples/hdfs/run
================================================
#!/usr/bin/env bash

# BEGIN_COPYRIGHT
#
# Copyright 2009-2026 CRS4.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
# of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

set -euo pipefail
[ -n "${DEBUG:-}" ] && set -x

this="${BASH_SOURCE-$0}"
this_dir=$(cd -P -- "$(dirname -- "${this}")" && pwd -P)
. "${this_dir}/../config.sh"

export TEST_ROOT="${TEST_ROOT:-pydoop_test_tree}"
DEPTH=${1:-3}
SPAN=${2:-4}

if [ "$(hadoop_fs)" != "file" ]; then
    echo "Waiting for HDFS to exit safe mode..."
    "${HDFS}" dfsadmin -safemode wait
fi

WD=$(mktemp -d)
pushd "${WD}"

echo "Generating tree (depth=${DEPTH}, span=${SPAN})..."
"${PYTHON}" "${this_dir}"/treegen.py ${DEPTH} ${SPAN}

echo "Computing usage by block size..."
"${PYTHON}" "${this_dir}"/treewalk.py

echo "Cleaning up..."
${HDFS} dfs -rm -r -f "${TEST_ROOT}"

if (( ${PY_VER} >= 3 )); then
    echo "Checking REPL example..."
    ${PYTHON} "${this_dir}"/repl_session.py
fi

popd
rm -rf "${WD}"


================================================
FILE: examples/hdfs/treegen.py
================================================
# BEGIN_COPYRIGHT
#
# Copyright 2009-2026 CRS4.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
# of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

"""
Generate an HDFS tree containing files of different block size.
"""

import sys
import random

import pydoop.hdfs as hdfs
from common import isdir, MB, TEST_ROOT

BS_RANGE = [_ * MB for _ in range(50, 101, 10)]


def treegen(fs, root, depth, span):
    if isdir(fs, root) and depth > 0:
        for i in range(span):
            path = u"%s/%d_%d" % (root, depth, i)
            kind = 'file' if i else 'directory'
            if kind == 'file':
                bs = random.sample(BS_RANGE, 1)[0]
                sys.stderr.write(
                    "%s %s %d\n" % (kind[0].upper(), path, (bs / MB))
                )
                with fs.open_file(path, "wt", blocksize=bs) as f:
                    f.write(path)
            else:
                sys.stderr.write("%s %s 0\n" % (kind[0].upper(), path))
                fs.create_directory(path)
                treegen(fs, path, depth - 1, span)


def main(argv):
    try:
        depth = int(argv[1])
        span = int(argv[2])
    except IndexError:
        print("Usage: python %s DEPTH SPAN" % argv[0])
        sys.exit(2)
    fs = hdfs.hdfs()
    try:
        root = "%s/%s" % (fs.working_directory(), TEST_ROOT)
        try:
            fs.delete(root)
        except IOError:
            pass
        fs.create_directory(root)
        treegen(fs, root, depth, span)
    finally:
        fs.close()


if __name__ == "__main__":
    main(sys.argv)


================================================
FILE: examples/hdfs/treewalk.py
================================================
# BEGIN_COPYRIGHT
#
# Copyright 2009-2026 CRS4.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
# of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

"""
Traverse an HDFS tree and output disk space usage by block size.
"""

# DOCS_INCLUDE_START
import pydoop.hdfs as hdfs
from common import MB, TEST_ROOT


def usage_by_bs(fs, root):
    stats = {}
    for info in fs.walk(root):
        if info['kind'] == 'directory':
            continue
        bs = int(info['block_size'])
        size = int(info['size'])
        stats[bs] = stats.get(bs, 0) + size
    return stats


if __name__ == "__main__":
    with hdfs.hdfs() as fs:
        root = "%s/%s" % (fs.working_directory(), TEST_ROOT)
        print("BS(MB)\tBYTES")
        for k, v in usage_by_bs(fs, root).items():
            print("%.1f\t%d" % (k / float(MB), v))
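The records consumed by usage_by_bs are plain dicts yielded by fs.walk. A minimal sketch that prints one line per tree entry, showing the fields treewalk.py relies on (it assumes the tree generated by treegen.py exists; 'kind', 'block_size' and 'size' are taken from the code above, while 'name' is assumed from the pydoop.hdfs path-info dict and should be treated as illustrative):

    import pydoop.hdfs as hdfs

    with hdfs.hdfs() as fs:
        for info in fs.walk("pydoop_test_tree"):
            # 'kind' is 'file' or 'directory'; sizes are in bytes.
            print(info["kind"], info["name"], info["block_size"], info["size"])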
================================================
FILE: examples/input/alice_1.txt
================================================
Project Gutenberg's Alice's Adventures in Wonderland, by Lewis Carroll

This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.org

Title: Alice's Adventures in Wonderland

Author: Lewis Carroll

Posting Date: June 25, 2008 [EBook #11]
Release Date: March, 1994

Language: English

Character set encoding: ASCII

*** START OF THIS PROJECT GUTENBERG EBOOK ALICE'S ADVENTURES IN WONDERLAND ***

ALICE'S ADVENTURES IN WONDERLAND

Lewis Carroll

THE MILLENNIUM FULCRUM EDITION 3.0

CHAPTER I. Down the Rabbit-Hole

Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?'

So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her.

There was nothing so VERY remarkable in that; nor did Alice think it so VERY much out of the way to hear the Rabbit say to itself, 'Oh dear! Oh dear! I shall be late!' (when she thought it over afterwards, it occurred to her that she ought to have wondered at this, but at the time it all seemed quite natural); but when the Rabbit actually TOOK A WATCH OUT OF ITS WAISTCOAT-POCKET, and looked at it, and then hurried on, Alice started to her feet, for it flashed across her mind that she had never before seen a rabbit with either a waistcoat-pocket, or a watch to take out of it, and burning with curiosity, she ran across the field after it, and fortunately was just in time to see it pop down a large rabbit-hole under the hedge.

In another moment down went Alice after it, never once considering how in the world she was to get out again.

The rabbit-hole went straight on like a tunnel for some way, and then dipped suddenly down, so suddenly that Alice had not a moment to think about stopping herself before she found herself falling down a very deep well.

Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs. She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.

'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)

Down, down, down. Would the fall NEVER come to an end! 'I wonder how many miles I've fallen by this time?' she said aloud. 'I must be getting somewhere near the centre of the earth.
Let me see: that would be four thousand miles down, I think--' (for, you see, Alice had learnt several things of this sort in her lessons in the schoolroom, and though this was not a VERY good opportunity for showing off her knowledge, as there was no one to listen to her, still it was good practice to say it over) '--yes, that's about the right distance--but then I wonder what Latitude or Longitude I've got to?' (Alice had no idea what Latitude was, or Longitude either, but thought they were nice grand words to say.) Presently she began again. 'I wonder if I shall fall right THROUGH the earth! How funny it'll seem to come out among the people that walk with their heads downward! The Antipathies, I think--' (she was rather glad there WAS no one listening, this time, as it didn't sound at all the right word) '--but I shall have to ask them what the name of the country is, you know. Please, Ma'am, is this New Zealand or Australia?' (and she tried to curtsey as she spoke--fancy CURTSEYING as you're falling through the air! Do you think you could manage it?) 'And what an ignorant little girl she'll think me for asking! No, it'll never do to ask: perhaps I shall see it written up somewhere.' Down, down, down. There was nothing else to do, so Alice soon began talking again. 'Dinah'll miss me very much to-night, I should think!' (Dinah was the cat.) 'I hope they'll remember her saucer of milk at tea-time. Dinah my dear! I wish you were down here with me! There are no mice in the air, I'm afraid, but you might catch a bat, and that's very like a mouse, you know. But do cats eat bats, I wonder?' And here Alice began to get rather sleepy, and went on saying to herself, in a dreamy sort of way, 'Do cats eat bats? Do cats eat bats?' and sometimes, 'Do bats eat cats?' for, you see, as she couldn't answer either question, it didn't much matter which way she put it. She felt that she was dozing off, and had just begun to dream that she was walking hand in hand with Dinah, and saying to her very earnestly, 'Now, Dinah, tell me the truth: did you ever eat a bat?' when suddenly, thump! thump! down she came upon a heap of sticks and dry leaves, and the fall was over. Alice was not a bit hurt, and she jumped up on to her feet in a moment: she looked up, but it was all dark overhead; before her was another long passage, and the White Rabbit was still in sight, hurrying down it. There was not a moment to be lost: away went Alice like the wind, and was just in time to hear it say, as it turned a corner, 'Oh my ears and whiskers, how late it's getting!' She was close behind it when she turned the corner, but the Rabbit was no longer to be seen: she found herself in a long, low hall, which was lit up by a row of lamps hanging from the roof. There were doors all round the hall, but they were all locked; and when Alice had been all the way down one side and up the other, trying every door, she walked sadly down the middle, wondering how she was ever to get out again. Suddenly she came upon a little three-legged table, all made of solid glass; there was nothing on it except a tiny golden key, and Alice's first thought was that it might belong to one of the doors of the hall; but, alas! either the locks were too large, or the key was too small, but at any rate it would not open any of them. However, on the second time round, she came upon a low curtain she had not noticed before, and behind it was a little door about fifteen inches high: she tried the little golden key in the lock, and to her great delight it fitted! 
Alice opened the door and found that it led into a small passage, not much larger than a rat-hole: she knelt down and looked along the passage into the loveliest garden you ever saw. How she longed to get out of that dark hall, and wander about among those beds of bright flowers and those cool fountains, but she could not even get her head through the doorway; 'and even if my head would go through,' thought poor Alice, 'it would be of very little use without my shoulders. Oh, how I wish I could shut up like a telescope! I think I could, if I only know how to begin.' For, you see, so many out-of-the-way things had happened lately, that Alice had begun to think that very few things indeed were really impossible. There seemed to be no use in waiting by the little door, so she went back to the table, half hoping she might find another key on it, or at any rate a book of rules for shutting people up like telescopes: this time she found a little bottle on it, ('which certainly was not here before,' said Alice,) and round the neck of the bottle was a paper label, with the words 'DRINK ME' beautifully printed on it in large letters. It was all very well to say 'Drink me,' but the wise little Alice was not going to do THAT in a hurry. 'No, I'll look first,' she said, 'and see whether it's marked "poison" or not'; for she had read several nice little histories about children who had got burnt, and eaten up by wild beasts and other unpleasant things, all because they WOULD not remember the simple rules their friends had taught them: such as, that a red-hot poker will burn you if you hold it too long; and that if you cut your finger VERY deeply with a knife, it usually bleeds; and she had never forgotten that, if you drink much from a bottle marked 'poison,' it is almost certain to disagree with you, sooner or later. However, this bottle was NOT marked 'poison,' so Alice ventured to taste it, and finding it very nice, (it had, in fact, a sort of mixed flavour of cherry-tart, custard, pine-apple, roast turkey, toffee, and hot buttered toast,) she very soon finished it off. * * * * * * * * * * * * * * * * * * * * 'What a curious feeling!' said Alice; 'I must be shutting up like a telescope.' And so it was indeed: she was now only ten inches high, and her face brightened up at the thought that she was now the right size for going through the little door into that lovely garden. First, however, she waited for a few minutes to see if she was going to shrink any further: she felt a little nervous about this; 'for it might end, you know,' said Alice to herself, 'in my going out altogether, like a candle. I wonder what I should be like then?' And she tried to fancy what the flame of a candle is like after the candle is blown out, for she could not remember ever having seen such a thing. After a while, finding that nothing more happened, she decided on going into the garden at once; but, alas for poor Alice! when she got to the door, she found she had forgotten the little golden key, and when she went back to the table for it, she found she could not possibly reach it: she could see it quite plainly through the glass, and she tried her best to climb up one of the legs of the table, but it was too slippery; and when she had tired herself out with trying, the poor little thing sat down and cried. 'Come, there's no use in crying like that!' said Alice to herself, rather sharply; 'I advise you to leave off this minute!' 
She generally gave herself very good advice, (though she very seldom followed it), and sometimes she scolded herself so severely as to bring tears into her eyes; and once she remembered trying to box her own ears for having cheated herself in a game of croquet she was playing against herself, for this curious child was very fond of pretending to be two people. 'But it's no use now,' thought poor Alice, 'to pretend to be two people! Why, there's hardly enough of me left to make ONE respectable person!' Soon her eye fell on a little glass box that was lying under the table: she opened it, and found in it a very small cake, on which the words 'EAT ME' were beautifully marked in currants. 'Well, I'll eat it,' said Alice, 'and if it makes me grow larger, I can reach the key; and if it makes me grow smaller, I can creep under the door; so either way I'll get into the garden, and I don't care which happens!' She ate a little bit, and said anxiously to herself, 'Which way? Which way?', holding her hand on the top of her head to feel which way it was growing, and she was quite surprised to find that she remained the same size: to be sure, this generally happens when one eats cake, but Alice had got so much into the way of expecting nothing but out-of-the-way things to happen, that it seemed quite dull and stupid for life to go on in the common way. So she set to work, and very soon finished off the cake. * * * * * * * * * * * * * * * * * * * * CHAPTER II. The Pool of Tears 'Curiouser and curiouser!' cried Alice (she was so much surprised, that for the moment she quite forgot how to speak good English); 'now I'm opening out like the largest telescope that ever was! Good-bye, feet!' (for when she looked down at her feet, they seemed to be almost out of sight, they were getting so far off). 'Oh, my poor little feet, I wonder who will put on your shoes and stockings for you now, dears? I'm sure _I_ shan't be able! I shall be a great deal too far off to trouble myself about you: you must manage the best way you can;--but I must be kind to them,' thought Alice, 'or perhaps they won't walk the way I want to go! Let me see: I'll give them a new pair of boots every Christmas.' And she went on planning to herself how she would manage it. 'They must go by the carrier,' she thought; 'and how funny it'll seem, sending presents to one's own feet! And how odd the directions will look! ALICE'S RIGHT FOOT, ESQ. HEARTHRUG, NEAR THE FENDER, (WITH ALICE'S LOVE). Oh dear, what nonsense I'm talking!' Just then her head struck against the roof of the hall: in fact she was now more than nine feet high, and she at once took up the little golden key and hurried off to the garden door. Poor Alice! It was as much as she could do, lying down on one side, to look through into the garden with one eye; but to get through was more hopeless than ever: she sat down and began to cry again. 'You ought to be ashamed of yourself,' said Alice, 'a great girl like you,' (she might well say this), 'to go on crying in this way! Stop this moment, I tell you!' But she went on all the same, shedding gallons of tears, until there was a large pool all round her, about four inches deep and reaching half down the hall. After a time she heard a little pattering of feet in the distance, and she hastily dried her eyes to see what was coming. It was the White Rabbit returning, splendidly dressed, with a pair of white kid gloves in one hand and a large fan in the other: he came trotting along in a great hurry, muttering to himself as he came, 'Oh! 
the Duchess, the Duchess! Oh! won't she be savage if I've kept her waiting!' Alice felt so desperate that she was ready to ask help of any one; so, when the Rabbit came near her, she began, in a low, timid voice, 'If you please, sir--' The Rabbit started violently, dropped the white kid gloves and the fan, and skurried away into the darkness as hard as he could go. Alice took up the fan and gloves, and, as the hall was very hot, she kept fanning herself all the time she went on talking: 'Dear, dear! How queer everything is to-day! And yesterday things went on just as usual. I wonder if I've been changed in the night? Let me think: was I the same when I got up this morning? I almost think I can remember feeling a little different. But if I'm not the same, the next question is, Who in the world am I? Ah, THAT'S the great puzzle!' And she began thinking over all the children she knew that were of the same age as herself, to see if she could have been changed for any of them. 'I'm sure I'm not Ada,' she said, 'for her hair goes in such long ringlets, and mine doesn't go in ringlets at all; and I'm sure I can't be Mabel, for I know all sorts of things, and she, oh! she knows such a very little! Besides, SHE'S she, and I'm I, and--oh dear, how puzzling it all is! I'll try if I know all the things I used to know. Let me see: four times five is twelve, and four times six is thirteen, and four times seven is--oh dear! I shall never get to twenty at that rate! However, the Multiplication Table doesn't signify: let's try Geography. London is the capital of Paris, and Paris is the capital of Rome, and Rome--no, THAT'S all wrong, I'm certain! I must have been changed for Mabel! I'll try and say "How doth the little--"' and she crossed her hands on her lap as if she were saying lessons, and began to repeat it, but her voice sounded hoarse and strange, and the words did not come the same as they used to do:-- 'How doth the little crocodile Improve his shining tail, And pour the waters of the Nile On every golden scale! 'How cheerfully he seems to grin, How neatly spread his claws, And welcome little fishes in With gently smiling jaws!' 'I'm sure those are not the right words,' said poor Alice, and her eyes filled with tears again as she went on, 'I must be Mabel after all, and I shall have to go and live in that poky little house, and have next to no toys to play with, and oh! ever so many lessons to learn! No, I've made up my mind about it; if I'm Mabel, I'll stay down here! It'll be no use their putting their heads down and saying "Come up again, dear!" I shall only look up and say "Who am I then? Tell me that first, and then, if I like being that person, I'll come up: if not, I'll stay down here till I'm somebody else"--but, oh dear!' cried Alice, with a sudden burst of tears, 'I do wish they WOULD put their heads down! I am so VERY tired of being all alone here!' As she said this she looked down at her hands, and was surprised to see that she had put on one of the Rabbit's little white kid gloves while she was talking. 'How CAN I have done that?' she thought. 'I must be growing small again.' She got up and went to the table to measure herself by it, and found that, as nearly as she could guess, she was now about two feet high, and was going on shrinking rapidly: she soon found out that the cause of this was the fan she was holding, and she dropped it hastily, just in time to avoid shrinking away altogether. 'That WAS a narrow escape!' 
said Alice, a good deal frightened at the sudden change, but very glad to find herself still in existence; 'and now for the garden!' and she ran with all speed back to the little door: but, alas! the little door was shut again, and the little golden key was lying on the glass table as before, 'and things are worse than ever,' thought the poor child, 'for I never was so small as this before, never! And I declare it's too bad, that it is!' As she said these words her foot slipped, and in another moment, splash! she was up to her chin in salt water. Her first idea was that she had somehow fallen into the sea, 'and in that case I can go back by railway,' she said to herself. (Alice had been to the seaside once in her life, and had come to the general conclusion, that wherever you go to on the English coast you find a number of bathing machines in the sea, some children digging in the sand with wooden spades, then a row of lodging houses, and behind them a railway station.) However, she soon made out that she was in the pool of tears which she had wept when she was nine feet high. 'I wish I hadn't cried so much!' said Alice, as she swam about, trying to find her way out. 'I shall be punished for it now, I suppose, by being drowned in my own tears! That WILL be a queer thing, to be sure! However, everything is queer to-day.' Just then she heard something splashing about in the pool a little way off, and she swam nearer to make out what it was: at first she thought it must be a walrus or hippopotamus, but then she remembered how small she was now, and she soon made out that it was only a mouse that had slipped in like herself. 'Would it be of any use, now,' thought Alice, 'to speak to this mouse? Everything is so out-of-the-way down here, that I should think very likely it can talk: at any rate, there's no harm in trying.' So she began: 'O Mouse, do you know the way out of this pool? I am very tired of swimming about here, O Mouse!' (Alice thought this must be the right way of speaking to a mouse: she had never done such a thing before, but she remembered having seen in her brother's Latin Grammar, 'A mouse--of a mouse--to a mouse--a mouse--O mouse!') The Mouse looked at her rather inquisitively, and seemed to her to wink with one of its little eyes, but it said nothing. 'Perhaps it doesn't understand English,' thought Alice; 'I daresay it's a French mouse, come over with William the Conqueror.' (For, with all her knowledge of history, Alice had no very clear notion how long ago anything had happened.) So she began again: 'Ou est ma chatte?' which was the first sentence in her French lesson-book. The Mouse gave a sudden leap out of the water, and seemed to quiver all over with fright. 'Oh, I beg your pardon!' cried Alice hastily, afraid that she had hurt the poor animal's feelings. 'I quite forgot you didn't like cats.' 'Not like cats!' cried the Mouse, in a shrill, passionate voice. 'Would YOU like cats if you were me?' 'Well, perhaps not,' said Alice in a soothing tone: 'don't be angry about it. And yet I wish I could show you our cat Dinah: I think you'd take a fancy to cats if you could only see her. She is such a dear quiet thing,' Alice went on, half to herself, as she swam lazily about in the pool, 'and she sits purring so nicely by the fire, licking her paws and washing her face--and she is such a nice soft thing to nurse--and she's such a capital one for catching mice--oh, I beg your pardon!' 
cried Alice again, for this time the Mouse was bristling all over, and she felt certain it must be really offended. 'We won't talk about her any more if you'd rather not.' 'We indeed!' cried the Mouse, who was trembling down to the end of his tail. 'As if I would talk on such a subject! Our family always HATED cats: nasty, low, vulgar things! Don't let me hear the name again!' 'I won't indeed!' said Alice, in a great hurry to change the subject of conversation. 'Are you--are you fond--of--of dogs?' The Mouse did not answer, so Alice went on eagerly: 'There is such a nice little dog near our house I should like to show you! A little bright-eyed terrier, you know, with oh, such long curly brown hair! And it'll fetch things when you throw them, and it'll sit up and beg for its dinner, and all sorts of things--I can't remember half of them--and it belongs to a farmer, you know, and he says it's so useful, it's worth a hundred pounds! He says it kills all the rats and--oh dear!' cried Alice in a sorrowful tone, 'I'm afraid I've offended it again!' For the Mouse was swimming away from her as hard as it could go, and making quite a commotion in the pool as it went. So she called softly after it, 'Mouse dear! Do come back again, and we won't talk about cats or dogs either, if you don't like them!' When the Mouse heard this, it turned round and swam slowly back to her: its face was quite pale (with passion, Alice thought), and it said in a low trembling voice, 'Let us get to the shore, and then I'll tell you my history, and you'll understand why it is I hate cats and dogs.' It was high time to go, for the pool was getting quite crowded with the birds and animals that had fallen into it: there were a Duck and a Dodo, a Lory and an Eaglet, and several other curious creatures. Alice led the way, and the whole party swam to the shore. CHAPTER III. A Caucus-Race and a Long Tale They were indeed a queer-looking party that assembled on the bank--the birds with draggled feathers, the animals with their fur clinging close to them, and all dripping wet, cross, and uncomfortable. The first question of course was, how to get dry again: they had a consultation about this, and after a few minutes it seemed quite natural to Alice to find herself talking familiarly with them, as if she had known them all her life. Indeed, she had quite a long argument with the Lory, who at last turned sulky, and would only say, 'I am older than you, and must know better'; and this Alice would not allow without knowing how old it was, and, as the Lory positively refused to tell its age, there was no more to be said. At last the Mouse, who seemed to be a person of authority among them, called out, 'Sit down, all of you, and listen to me! I'LL soon make you dry enough!' They all sat down at once, in a large ring, with the Mouse in the middle. Alice kept her eyes anxiously fixed on it, for she felt sure she would catch a bad cold if she did not get dry very soon. 'Ahem!' said the Mouse with an important air, 'are you all ready? This is the driest thing I know. Silence all round, if you please! "William the Conqueror, whose cause was favoured by the pope, was soon submitted to by the English, who wanted leaders, and had been of late much accustomed to usurpation and conquest. Edwin and Morcar, the earls of Mercia and Northumbria--"' 'Ugh!' said the Lory, with a shiver. 'I beg your pardon!' said the Mouse, frowning, but very politely: 'Did you speak?' 'Not I!' said the Lory hastily. 'I thought you did,' said the Mouse. '--I proceed. 
"Edwin and Morcar, the earls of Mercia and Northumbria, declared for him: and even Stigand, the patriotic archbishop of Canterbury, found it advisable--"' 'Found WHAT?' said the Duck. 'Found IT,' the Mouse replied rather crossly: 'of course you know what "it" means.' 'I know what "it" means well enough, when I find a thing,' said the Duck: 'it's generally a frog or a worm. The question is, what did the archbishop find?' The Mouse did not notice this question, but hurriedly went on, '"--found it advisable to go with Edgar Atheling to meet William and offer him the crown. William's conduct at first was moderate. But the insolence of his Normans--" How are you getting on now, my dear?' it continued, turning to Alice as it spoke. 'As wet as ever,' said Alice in a melancholy tone: 'it doesn't seem to dry me at all.' 'In that case,' said the Dodo solemnly, rising to its feet, 'I move that the meeting adjourn, for the immediate adoption of more energetic remedies--' 'Speak English!' said the Eaglet. 'I don't know the meaning of half those long words, and, what's more, I don't believe you do either!' And the Eaglet bent down its head to hide a smile: some of the other birds tittered audibly. 'What I was going to say,' said the Dodo in an offended tone, 'was, that the best thing to get us dry would be a Caucus-race.' 'What IS a Caucus-race?' said Alice; not that she wanted much to know, but the Dodo had paused as if it thought that SOMEBODY ought to speak, and no one else seemed inclined to say anything. 'Why,' said the Dodo, 'the best way to explain it is to do it.' (And, as you might like to try the thing yourself, some winter day, I will tell you how the Dodo managed it.) First it marked out a race-course, in a sort of circle, ('the exact shape doesn't matter,' it said,) and then all the party were placed along the course, here and there. There was no 'One, two, three, and away,' but they began running when they liked, and left off when they liked, so that it was not easy to know when the race was over. However, when they had been running half an hour or so, and were quite dry again, the Dodo suddenly called out 'The race is over!' and they all crowded round it, panting, and asking, 'But who has won?' This question the Dodo could not answer without a great deal of thought, and it sat for a long time with one finger pressed upon its forehead (the position in which you usually see Shakespeare, in the pictures of him), while the rest waited in silence. At last the Dodo said, 'EVERYBODY has won, and all must have prizes.' 'But who is to give the prizes?' quite a chorus of voices asked. 'Why, SHE, of course,' said the Dodo, pointing to Alice with one finger; and the whole party at once crowded round her, calling out in a confused way, 'Prizes! Prizes!' Alice had no idea what to do, and in despair she put her hand in her pocket, and pulled out a box of comfits, (luckily the salt water had not got into it), and handed them round as prizes. There was exactly one a-piece all round. 'But she must have a prize herself, you know,' said the Mouse. 'Of course,' the Dodo replied very gravely. 'What else have you got in your pocket?' he went on, turning to Alice. 'Only a thimble,' said Alice sadly. 'Hand it over here,' said the Dodo. Then they all crowded round her once more, while the Dodo solemnly presented the thimble, saying 'We beg your acceptance of this elegant thimble'; and, when it had finished this short speech, they all cheered. 
Alice thought the whole thing very absurd, but they all looked so grave that she did not dare to laugh; and, as she could not think of anything to say, she simply bowed, and took the thimble, looking as solemn as she could. The next thing was to eat the comfits: this caused some noise and confusion, as the large birds complained that they could not taste theirs, and the small ones choked and had to be patted on the back. However, it was over at last, and they sat down again in a ring, and begged the Mouse to tell them something more. 'You promised to tell me your history, you know,' said Alice, 'and why it is you hate--C and D,' she added in a whisper, half afraid that it would be offended again. 'Mine is a long and a sad tale!' said the Mouse, turning to Alice, and sighing. 'It IS a long tail, certainly,' said Alice, looking down with wonder at the Mouse's tail; 'but why do you call it sad?' And she kept on puzzling about it while the Mouse was speaking, so that her idea of the tale was something like this:-- 'Fury said to a mouse, That he met in the house, "Let us both go to law: I will prosecute YOU.--Come, I'll take no denial; We must have a trial: For really this morning I've nothing to do." Said the mouse to the cur, "Such a trial, dear Sir, With no jury or judge, would be wasting our breath." "I'll be judge, I'll be jury," Said cunning old Fury: "I'll try the whole cause, and condemn you to death."' 'You are not attending!' said the Mouse to Alice severely. 'What are you thinking of?' 'I beg your pardon,' said Alice very humbly: 'you had got to the fifth bend, I think?' 'I had NOT!' cried the Mouse, sharply and very angrily. 'A knot!' said Alice, always ready to make herself useful, and looking anxiously about her. 'Oh, do let me help to undo it!' 'I shall do nothing of the sort,' said the Mouse, getting up and walking away. 'You insult me by talking such nonsense!' 'I didn't mean it!' pleaded poor Alice. 'But you're so easily offended, you know!' The Mouse only growled in reply. 'Please come back and finish your story!' Alice called after it; and the others all joined in chorus, 'Yes, please do!' but the Mouse only shook its head impatiently, and walked a little quicker. 'What a pity it wouldn't stay!' sighed the Lory, as soon as it was quite out of sight; and an old Crab took the opportunity of saying to her daughter 'Ah, my dear! Let this be a lesson to you never to lose YOUR temper!' 'Hold your tongue, Ma!' said the young Crab, a little snappishly. 'You're enough to try the patience of an oyster!' 'I wish I had our Dinah here, I know I do!' said Alice aloud, addressing nobody in particular. 'She'd soon fetch it back!' 'And who is Dinah, if I might venture to ask the question?' said the Lory. Alice replied eagerly, for she was always ready to talk about her pet: 'Dinah's our cat. And she's such a capital one for catching mice you can't think! And oh, I wish you could see her after the birds! Why, she'll eat a little bird as soon as look at it!' This speech caused a remarkable sensation among the party. Some of the birds hurried off at once: one old Magpie began wrapping itself up very carefully, remarking, 'I really must be getting home; the night-air doesn't suit my throat!' and a Canary called out in a trembling voice to its children, 'Come away, my dears! It's high time you were all in bed!' On various pretexts they all moved off, and Alice was soon left alone. 'I wish I hadn't mentioned Dinah!' she said to herself in a melancholy tone. 
'Nobody seems to like her, down here, and I'm sure she's the best cat in the world! Oh, my dear Dinah! I wonder if I shall ever see you any more!' And here poor Alice began to cry again, for she felt very lonely and low-spirited. In a little while, however, she again heard a little pattering of footsteps in the distance, and she looked up eagerly, half hoping that the Mouse had changed his mind, and was coming back to finish his story. CHAPTER IV. The Rabbit Sends in a Little Bill It was the White Rabbit, trotting slowly back again, and looking anxiously about as it went, as if it had lost something; and she heard it muttering to itself 'The Duchess! The Duchess! Oh my dear paws! Oh my fur and whiskers! She'll get me executed, as sure as ferrets are ferrets! Where CAN I have dropped them, I wonder?' Alice guessed in a moment that it was looking for the fan and the pair of white kid gloves, and she very good-naturedly began hunting about for them, but they were nowhere to be seen--everything seemed to have changed since her swim in the pool, and the great hall, with the glass table and the little door, had vanished completely. Very soon the Rabbit noticed Alice, as she went hunting about, and called out to her in an angry tone, 'Why, Mary Ann, what ARE you doing out here? Run home this moment, and fetch me a pair of gloves and a fan! Quick, now!' And Alice was so much frightened that she ran off at once in the direction it pointed to, without trying to explain the mistake it had made. 'He took me for his housemaid,' she said to herself as she ran. 'How surprised he'll be when he finds out who I am! But I'd better take him his fan and gloves--that is, if I can find them.' As she said this, she came upon a neat little house, on the door of which was a bright brass plate with the name 'W. RABBIT' engraved upon it. She went in without knocking, and hurried upstairs, in great fear lest she should meet the real Mary Ann, and be turned out of the house before she had found the fan and gloves. 'How queer it seems,' Alice said to herself, 'to be going messages for a rabbit! I suppose Dinah'll be sending me on messages next!' And she began fancying the sort of thing that would happen: '"Miss Alice! Come here directly, and get ready for your walk!" "Coming in a minute, nurse! But I've got to see that the mouse doesn't get out." Only I don't think,' Alice went on, 'that they'd let Dinah stop in the house if it began ordering people about like that!' By this time she had found her way into a tidy little room with a table in the window, and on it (as she had hoped) a fan and two or three pairs of tiny white kid gloves: she took up the fan and a pair of the gloves, and was just going to leave the room, when her eye fell upon a little bottle that stood near the looking-glass. There was no label this time with the words 'DRINK ME,' but nevertheless she uncorked it and put it to her lips. 'I know SOMETHING interesting is sure to happen,' she said to herself, 'whenever I eat or drink anything; so I'll just see what this bottle does. I do hope it'll make me grow large again, for really I'm quite tired of being such a tiny little thing!' It did so indeed, and much sooner than she had expected: before she had drunk half the bottle, she found her head pressing against the ceiling, and had to stoop to save her neck from being broken. She hastily put down the bottle, saying to herself 'That's quite enough--I hope I shan't grow any more--As it is, I can't get out at the door--I do wish I hadn't drunk quite so much!' 
Alas! it was too late to wish that! She went on growing, and growing, and very soon had to kneel down on the floor: in another minute there was not even room for this, and she tried the effect of lying down with one elbow against the door, and the other arm curled round her head. Still she went on growing, and, as a last resource, she put one arm out of the window, and one foot up the chimney, and said to herself 'Now I can do no more, whatever happens. What WILL become of me?' Luckily for Alice, the little magic bottle had now had its full effect, and she grew no larger: still it was very uncomfortable, and, as there seemed to be no sort of chance of her ever getting out of the room again, no wonder she felt unhappy. 'It was much pleasanter at home,' thought poor Alice, 'when one wasn't always growing larger and smaller, and being ordered about by mice and rabbits. I almost wish I hadn't gone down that rabbit-hole--and yet--and yet--it's rather curious, you know, this sort of life! I do wonder what CAN have happened to me! When I used to read fairy-tales, I fancied that kind of thing never happened, and now here I am in the middle of one! There ought to be a book written about me, that there ought! And when I grow up, I'll write one--but I'm grown up now,' she added in a sorrowful tone; 'at least there's no room to grow up any more HERE.' 'But then,' thought Alice, 'shall I NEVER get any older than I am now? That'll be a comfort, one way--never to be an old woman--but then--always to have lessons to learn! Oh, I shouldn't like THAT!' 'Oh, you foolish Alice!' she answered herself. 'How can you learn lessons in here? Why, there's hardly room for YOU, and no room at all for any lesson-books!' And so she went on, taking first one side and then the other, and making quite a conversation of it altogether; but after a few minutes she heard a voice outside, and stopped to listen. 'Mary Ann! Mary Ann!' said the voice. 'Fetch me my gloves this moment!' Then came a little pattering of feet on the stairs. Alice knew it was the Rabbit coming to look for her, and she trembled till she shook the house, quite forgetting that she was now about a thousand times as large as the Rabbit, and had no reason to be afraid of it. Presently the Rabbit came up to the door, and tried to open it; but, as the door opened inwards, and Alice's elbow was pressed hard against it, that attempt proved a failure. Alice heard it say to itself 'Then I'll go round and get in at the window.' 'THAT you won't' thought Alice, and, after waiting till she fancied she heard the Rabbit just under the window, she suddenly spread out her hand, and made a snatch in the air. She did not get hold of anything, but she heard a little shriek and a fall, and a crash of broken glass, from which she concluded that it was just possible it had fallen into a cucumber-frame, or something of the sort. Next came an angry voice--the Rabbit's--'Pat! Pat! Where are you?' And then a voice she had never heard before, 'Sure then I'm here! Digging for apples, yer honour!' 'Digging for apples, indeed!' said the Rabbit angrily. 'Here! Come and help me out of THIS!' (Sounds of more broken glass.) 'Now tell me, Pat, what's that in the window?' 'Sure, it's an arm, yer honour!' (He pronounced it 'arrum.') 'An arm, you goose! Who ever saw one that size? Why, it fills the whole window!' 'Sure, it does, yer honour: but it's an arm for all that.' 'Well, it's got no business there, at any rate: go and take it away!' 
There was a long silence after this, and Alice could only hear whispers now and then; such as, 'Sure, I don't like it, yer honour, at all, at all!' 'Do as I tell you, you coward!' and at last she spread out her hand again, and made another snatch in the air. This time there were TWO little shrieks, and more sounds of broken glass. 'What a number of cucumber-frames there must be!' thought Alice. 'I wonder what they'll do next! As for pulling me out of the window, I only wish they COULD! I'm sure I don't want to stay in here any longer!' She waited for some time without hearing anything more: at last came a rumbling of little cartwheels, and the sound of a good many voices all talking together: she made out the words: 'Where's the other ladder?--Why, I hadn't to bring but one; Bill's got the other--Bill! fetch it here, lad!--Here, put 'em up at this corner--No, tie 'em together first--they don't reach half high enough yet--Oh! they'll do well enough; don't be particular--Here, Bill! catch hold of this rope--Will the roof bear?--Mind that loose slate--Oh, it's coming down! Heads below!' (a loud crash)--'Now, who did that?--It was Bill, I fancy--Who's to go down the chimney?--Nay, I shan't! YOU do it!--That I won't, then!--Bill's to go down--Here, Bill! the master says you're to go down the chimney!' 'Oh! So Bill's got to come down the chimney, has he?' said Alice to herself. 'Shy, they seem to put everything upon Bill! I wouldn't be in Bill's place for a good deal: this fireplace is narrow, to be sure; but I THINK I can kick a little!' She drew her foot as far down the chimney as she could, and waited till she heard a little animal (she couldn't guess of what sort it was) scratching and scrambling about in the chimney close above her: then, saying to herself 'This is Bill,' she gave one sharp kick, and waited to see what would happen next. The first thing she heard was a general chorus of 'There goes Bill!' then the Rabbit's voice along--'Catch him, you by the hedge!' then silence, and then another confusion of voices--'Hold up his head--Brandy now--Don't choke him--How was it, old fellow? What happened to you? Tell us all about it!' Last came a little feeble, squeaking voice, ('That's Bill,' thought Alice,) 'Well, I hardly know--No more, thank ye; I'm better now--but I'm a deal too flustered to tell you--all I know is, something comes at me like a Jack-in-the-box, and up I goes like a sky-rocket!' 'So you did, old fellow!' said the others. 'We must burn the house down!' said the Rabbit's voice; and Alice called out as loud as she could, 'If you do. I'll set Dinah at you!' There was a dead silence instantly, and Alice thought to herself, 'I wonder what they WILL do next! If they had any sense, they'd take the roof off.' After a minute or two, they began moving about again, and Alice heard the Rabbit say, 'A barrowful will do, to begin with.' 'A barrowful of WHAT?' thought Alice; but she had not long to doubt, for the next moment a shower of little pebbles came rattling in at the window, and some of them hit her in the face. 'I'll put a stop to this,' she said to herself, and shouted out, 'You'd better not do that again!' which produced another dead silence. Alice noticed with some surprise that the pebbles were all turning into little cakes as they lay on the floor, and a bright idea came into her head. 'If I eat one of these cakes,' she thought, 'it's sure to make SOME change in my size; and as it can't possibly make me larger, it must make me smaller, I suppose.' 
So she swallowed one of the cakes, and was delighted to find that she began shrinking directly. As soon as she was small enough to get through the door, she ran out of the house, and found quite a crowd of little animals and birds waiting outside. The poor little Lizard, Bill, was in the middle, being held up by two guinea-pigs, who were giving it something out of a bottle. They all made a rush at Alice the moment she appeared; but she ran off as hard as she could, and soon found herself safe in a thick wood. 'The first thing I've got to do,' said Alice to herself, as she wandered about in the wood, 'is to grow to my right size again; and the second thing is to find my way into that lovely garden. I think that will be the best plan.' It sounded an excellent plan, no doubt, and very neatly and simply arranged; the only difficulty was, that she had not the smallest idea how to set about it; and while she was peering about anxiously among the trees, a little sharp bark just over her head made her look up in a great hurry. An enormous puppy was looking down at her with large round eyes, and feebly stretching out one paw, trying to touch her. 'Poor little thing!' said Alice, in a coaxing tone, and she tried hard to whistle to it; but she was terribly frightened all the time at the thought that it might be hungry, in which case it would be very likely to eat her up in spite of all her coaxing. Hardly knowing what she did, she picked up a little bit of stick, and held it out to the puppy; whereupon the puppy jumped into the air off all its feet at once, with a yelp of delight, and rushed at the stick, and made believe to worry it; then Alice dodged behind a great thistle, to keep herself from being run over; and the moment she appeared on the other side, the puppy made another rush at the stick, and tumbled head over heels in its hurry to get hold of it; then Alice, thinking it was very like having a game of play with a cart-horse, and expecting every moment to be trampled under its feet, ran round the thistle again; then the puppy began a series of short charges at the stick, running a very little way forwards each time and a long way back, and barking hoarsely all the while, till at last it sat down a good way off, panting, with its tongue hanging out of its mouth, and its great eyes half shut. This seemed to Alice a good opportunity for making her escape; so she set off at once, and ran till she was quite tired and out of breath, and till the puppy's bark sounded quite faint in the distance. 'And yet what a dear little puppy it was!' said Alice, as she leant against a buttercup to rest herself, and fanned herself with one of the leaves: 'I should have liked teaching it tricks very much, if--if I'd only been the right size to do it! Oh dear! I'd nearly forgotten that I've got to grow up again! Let me see--how IS it to be managed? I suppose I ought to eat or drink something or other; but the great question is, what?' The great question certainly was, what? Alice looked all round her at the flowers and the blades of grass, but she did not see anything that looked like the right thing to eat or drink under the circumstances. There was a large mushroom growing near her, about the same height as herself; and when she had looked under it, and on both sides of it, and behind it, it occurred to her that she might as well look and see what was on the top of it. 
She stretched herself up on tiptoe, and peeped over the edge of the mushroom, and her eyes immediately met those of a large caterpillar, that was sitting on the top with its arms folded, quietly smoking a long hookah, and taking not the smallest notice of her or of anything else. CHAPTER V. Advice from a Caterpillar The Caterpillar and Alice looked at each other for some time in silence: at last the Caterpillar took the hookah out of its mouth, and addressed her in a languid, sleepy voice. 'Who are YOU?' said the Caterpillar. This was not an encouraging opening for a conversation. Alice replied, rather shyly, 'I--I hardly know, sir, just at present--at least I know who I WAS when I got up this morning, but I think I must have been changed several times since then.' 'What do you mean by that?' said the Caterpillar sternly. 'Explain yourself!' 'I can't explain MYSELF, I'm afraid, sir' said Alice, 'because I'm not myself, you see.' 'I don't see,' said the Caterpillar. 'I'm afraid I can't put it more clearly,' Alice replied very politely, 'for I can't understand it myself to begin with; and being so many different sizes in a day is very confusing.' 'It isn't,' said the Caterpillar. 'Well, perhaps you haven't found it so yet,' said Alice; 'but when you have to turn into a chrysalis--you will some day, you know--and then after that into a butterfly, I should think you'll feel it a little queer, won't you?' 'Not a bit,' said the Caterpillar. 'Well, perhaps your feelings may be different,' said Alice; 'all I know is, it would feel very queer to ME.' 'You!' said the Caterpillar contemptuously. 'Who are YOU?' Which brought them back again to the beginning of the conversation. Alice felt a little irritated at the Caterpillar's making such VERY short remarks, and she drew herself up and said, very gravely, 'I think, you ought to tell me who YOU are, first.' 'Why?' said the Caterpillar. Here was another puzzling question; and as Alice could not think of any good reason, and as the Caterpillar seemed to be in a VERY unpleasant state of mind, she turned away. 'Come back!' the Caterpillar called after her. 'I've something important to say!' This sounded promising, certainly: Alice turned and came back again. 'Keep your temper,' said the Caterpillar. 'Is that all?' said Alice, swallowing down her anger as well as she could. 'No,' said the Caterpillar. Alice thought she might as well wait, as she had nothing else to do, and perhaps after all it might tell her something worth hearing. For some minutes it puffed away without speaking, but at last it unfolded its arms, took the hookah out of its mouth again, and said, 'So you think you're changed, do you?' 'I'm afraid I am, sir,' said Alice; 'I can't remember things as I used--and I don't keep the same size for ten minutes together!' 'Can't remember WHAT things?' said the Caterpillar. 'Well, I've tried to say "HOW DOTH THE LITTLE BUSY BEE," but it all came different!' Alice replied in a very melancholy voice. 'Repeat, "YOU ARE OLD, FATHER WILLIAM,"' said the Caterpillar. Alice folded her hands, and began:-- 'You are old, Father William,' the young man said, 'And your hair has become very white; And yet you incessantly stand on your head-- Do you think, at your age, it is right?' 'In my youth,' Father William replied to his son, 'I feared it might injure the brain; But, now that I'm perfectly sure I have none, Why, I do it again and again.' 
'You are old,' said the youth, 'as I mentioned before, And have grown most uncommonly fat; Yet you turned a back-somersault in at the door-- Pray, what is the reason of that?' 'In my youth,' said the sage, as he shook his grey locks, 'I kept all my limbs very supple By the use of this ointment--one shilling the box-- Allow me to sell you a couple?' 'You are old,' said the youth, 'and your jaws are too weak For anything tougher than suet; Yet you finished the goose, with the bones and the beak-- Pray how did you manage to do it?' 'In my youth,' said his father, 'I took to the law, And argued each case with my wife; And the muscular strength, which it gave to my jaw, Has lasted the rest of my life.' 'You are old,' said the youth, 'one would hardly suppose That your eye was as steady as ever; Yet you balanced an eel on the end of your nose-- What made you so awfully clever?' 'I have answered three questions, and that is enough,' Said his father; 'don't give yourself airs! Do you think I can listen all day to such stuff? Be off, or I'll kick you down stairs!' 'That is not said right,' said the Caterpillar. 'Not QUITE right, I'm afraid,' said Alice, timidly; 'some of the words have got altered.' 'It is wrong from beginning to end,' said the Caterpillar decidedly, and there was silence for some minutes. The Caterpillar was the first to speak. 'What size do you want to be?' it asked. 'Oh, I'm not particular as to size,' Alice hastily replied; 'only one doesn't like changing so often, you know.' 'I DON'T know,' said the Caterpillar. Alice said nothing: she had never been so much contradicted in her life before, and she felt that she was losing her temper. 'Are you content now?' said the Caterpillar. 'Well, I should like to be a LITTLE larger, sir, if you wouldn't mind,' said Alice: 'three inches is such a wretched height to be.' 'It is a very good height indeed!' said the Caterpillar angrily, rearing itself upright as it spoke (it was exactly three inches high). 'But I'm not used to it!' pleaded poor Alice in a piteous tone. And she thought of herself, 'I wish the creatures wouldn't be so easily offended!' 'You'll get used to it in time,' said the Caterpillar; and it put the hookah into its mouth and began smoking again. This time Alice waited patiently until it chose to speak again. In a minute or two the Caterpillar took the hookah out of its mouth and yawned once or twice, and shook itself. Then it got down off the mushroom, and crawled away in the grass, merely remarking as it went, 'One side will make you grow taller, and the other side will make you grow shorter.' 'One side of WHAT? The other side of WHAT?' thought Alice to herself. 'Of the mushroom,' said the Caterpillar, just as if she had asked it aloud; and in another moment it was out of sight. Alice remained looking thoughtfully at the mushroom for a minute, trying to make out which were the two sides of it; and as it was perfectly round, she found this a very difficult question. However, at last she stretched her arms round it as far as they would go, and broke off a bit of the edge with each hand. 'And now which is which?' she said to herself, and nibbled a little of the right-hand bit to try the effect: the next moment she felt a violent blow underneath her chin: it had struck her foot! She was a good deal frightened by this very sudden change, but she felt that there was no time to be lost, as she was shrinking rapidly; so she set to work at once to eat some of the other bit. 
Her chin was pressed so closely against her foot, that there was hardly room to open her mouth; but she did it at last, and managed to swallow a morsel of the lefthand bit. * * * * * * * * * * * * * * * * * * * * 'Come, my head's free at last!' said Alice in a tone of delight, which changed into alarm in another moment, when she found that her shoulders were nowhere to be found: all she could see, when she looked down, was an immense length of neck, which seemed to rise like a stalk out of a sea of green leaves that lay far below her. 'What CAN all that green stuff be?' said Alice. 'And where HAVE my shoulders got to? And oh, my poor hands, how is it I can't see you?' She was moving them about as she spoke, but no result seemed to follow, except a little shaking among the distant green leaves. As there seemed to be no chance of getting her hands up to her head, she tried to get her head down to them, and was delighted to find that her neck would bend about easily in any direction, like a serpent. She had just succeeded in curving it down into a graceful zigzag, and was going to dive in among the leaves, which she found to be nothing but the tops of the trees under which she had been wandering, when a sharp hiss made her draw back in a hurry: a large pigeon had flown into her face, and was beating her violently with its wings. 'Serpent!' screamed the Pigeon. 'I'm NOT a serpent!' said Alice indignantly. 'Let me alone!' 'Serpent, I say again!' repeated the Pigeon, but in a more subdued tone, and added with a kind of sob, 'I've tried every way, and nothing seems to suit them!' 'I haven't the least idea what you're talking about,' said Alice. 'I've tried the roots of trees, and I've tried banks, and I've tried hedges,' the Pigeon went on, without attending to her; 'but those serpents! There's no pleasing them!' Alice was more and more puzzled, but she thought there was no use in saying anything more till the Pigeon had finished. 'As if it wasn't trouble enough hatching the eggs,' said the Pigeon; 'but I must be on the look-out for serpents night and day! Why, I haven't had a wink of sleep these three weeks!' 'I'm very sorry you've been annoyed,' said Alice, who was beginning to see its meaning. 'And just as I'd taken the highest tree in the wood,' continued the Pigeon, raising its voice to a shriek, 'and just as I was thinking I should be free of them at last, they must needs come wriggling down from the sky! Ugh, Serpent!' 'But I'm NOT a serpent, I tell you!' said Alice. 'I'm a--I'm a--' 'Well! WHAT are you?' said the Pigeon. 'I can see you're trying to invent something!' 'I--I'm a little girl,' said Alice, rather doubtfully, as she remembered the number of changes she had gone through that day. 'A likely story indeed!' said the Pigeon in a tone of the deepest contempt. 'I've seen a good many little girls in my time, but never ONE with such a neck as that! No, no! You're a serpent; and there's no use denying it. I suppose you'll be telling me next that you never tasted an egg!' 'I HAVE tasted eggs, certainly,' said Alice, who was a very truthful child; 'but little girls eat eggs quite as much as serpents do, you know.' 'I don't believe it,' said the Pigeon; 'but if they do, why then they're a kind of serpent, that's all I can say.' This was such a new idea to Alice, that she was quite silent for a minute or two, which gave the Pigeon the opportunity of adding, 'You're looking for eggs, I know THAT well enough; and what does it matter to me whether you're a little girl or a serpent?' 
'It matters a good deal to ME,' said Alice hastily; 'but I'm not looking for eggs, as it happens; and if I was, I shouldn't want YOURS: I don't like them raw.' 'Well, be off, then!' said the Pigeon in a sulky tone, as it settled down again into its nest. Alice crouched down among the trees as well as she could, for her neck kept getting entangled among the branches, and every now and then she had to stop and untwist it. After a while she remembered that she still held the pieces of mushroom in her hands, and she set to work very carefully, nibbling first at one and then at the other, and growing sometimes taller and sometimes shorter, until she had succeeded in bringing herself down to her usual height. It was so long since she had been anything near the right size, that it felt quite strange at first; but she got used to it in a few minutes, and began talking to herself, as usual. 'Come, there's half my plan done now! How puzzling all these changes are! I'm never sure what I'm going to be, from one minute to another! However, I've got back to my right size: the next thing is, to get into that beautiful garden--how IS that to be done, I wonder?' As she said this, she came suddenly upon an open place, with a little house in it about four feet high. 'Whoever lives there,' thought Alice, 'it'll never do to come upon them THIS size: why, I should frighten them out of their wits!' So she began nibbling at the righthand bit again, and did not venture to go near the house till she had brought herself down to nine inches high. CHAPTER VI. Pig and Pepper For a minute or two she stood looking at the house, and wondering what to do next, when suddenly a footman in livery came running out of the wood--(she considered him to be a footman because he was in livery: otherwise, judging by his face only, she would have called him a fish)--and rapped loudly at the door with his knuckles. It was opened by another footman in livery, with a round face, and large eyes like a frog; and both footmen, Alice noticed, had powdered hair that curled all over their heads. She felt very curious to know what it was all about, and crept a little way out of the wood to listen. The Fish-Footman began by producing from under his arm a great letter, nearly as large as himself, and this he handed over to the other, saying, in a solemn tone, 'For the Duchess. An invitation from the Queen to play croquet.' The Frog-Footman repeated, in the same solemn tone, only changing the order of the words a little, 'From the Queen. An invitation for the Duchess to play croquet.' Then they both bowed low, and their curls got entangled together. Alice laughed so much at this, that she had to run back into the wood for fear of their hearing her; and when she next peeped out the Fish-Footman was gone, and the other was sitting on the ground near the door, staring stupidly up into the sky. Alice went timidly up to the door, and knocked. 'There's no sort of use in knocking,' said the Footman, 'and that for two reasons. First, because I'm on the same side of the door as you are; secondly, because they're making such a noise inside, no one could possibly hear you.' And certainly there was a most extraordinary noise going on within--a constant howling and sneezing, and every now and then a great crash, as if a dish or kettle had been broken to pieces. 'Please, then,' said Alice, 'how am I to get in?' 'There might be some sense in your knocking,' the Footman went on without attending to her, 'if we had the door between us. 
For instance, if you were INSIDE, you might knock, and I could let you out, you know.' He was looking up into the sky all the time he was speaking, and this Alice thought decidedly uncivil. 'But perhaps he can't help it,' she said to herself; 'his eyes are so VERY nearly at the top of his head. But at any rate he might answer questions.--How am I to get in?' she repeated, aloud. 'I shall sit here,' the Footman remarked, 'till tomorrow--' At this moment the door of the house opened, and a large plate came skimming out, straight at the Footman's head: it just grazed his nose, and broke to pieces against one of the trees behind him. '--or next day, maybe,' the Footman continued in the same tone, exactly as if nothing had happened. 'How am I to get in?' asked Alice again, in a louder tone. 'ARE you to get in at all?' said the Footman. 'That's the first question, you know.' It was, no doubt: only Alice did not like to be told so. 'It's really dreadful,' she muttered to herself, 'the way all the creatures argue. It's enough to drive one crazy!' The Footman seemed to think this a good opportunity for repeating his remark, with variations. 'I shall sit here,' he said, 'on and off, for days and days.' 'But what am I to do?' said Alice. 'Anything you like,' said the Footman, and began whistling. 'Oh, there's no use in talking to him,' said Alice desperately: 'he's perfectly idiotic!' And she opened the door and went in. The door led right into a large kitchen, which was full of smoke from one end to the other: the Duchess was sitting on a three-legged stool in the middle, nursing a baby; the cook was leaning over the fire, stirring a large cauldron which seemed to be full of soup. 'There's certainly too much pepper in that soup!' Alice said to herself, as well as she could for sneezing. There was certainly too much of it in the air. Even the Duchess sneezed occasionally; and as for the baby, it was sneezing and howling alternately without a moment's pause. The only things in the kitchen that did not sneeze, were the cook, and a large cat which was sitting on the hearth and grinning from ear to ear. 'Please would you tell me,' said Alice, a little timidly, for she was not quite sure whether it was good manners for her to speak first, 'why your cat grins like that?' 'It's a Cheshire cat,' said the Duchess, 'and that's why. Pig!' She said the last word with such sudden violence that Alice quite jumped; but she saw in another moment that it was addressed to the baby, and not to her, so she took courage, and went on again:-- 'I didn't know that Cheshire cats always grinned; in fact, I didn't know that cats COULD grin.' 'They all can,' said the Duchess; 'and most of 'em do.' 'I don't know of any that do,' Alice said very politely, feeling quite pleased to have got into a conversation. 'You don't know much,' said the Duchess; 'and that's a fact.' Alice did not at all like the tone of this remark, and thought it would be as well to introduce some other subject of conversation. While she was trying to fix on one, the cook took the cauldron of soup off the fire, and at once set to work throwing everything within her reach at the Duchess and the baby--the fire-irons came first; then followed a shower of saucepans, plates, and dishes. The Duchess took no notice of them even when they hit her; and the baby was howling so much already, that it was quite impossible to say whether the blows hurt it or not. 'Oh, PLEASE mind what you're doing!' cried Alice, jumping up and down in an agony of terror. 
'Oh, there goes his PRECIOUS nose'; as an unusually large saucepan flew close by it, and very nearly carried it off. 'If everybody minded their own business,' the Duchess said in a hoarse growl, 'the world would go round a deal faster than it does.' 'Which would NOT be an advantage,' said Alice, who felt very glad to get an opportunity of showing off a little of her knowledge. 'Just think of what work it would make with the day and night! You see the earth takes twenty-four hours to turn round on its axis--' 'Talking of axes,' said the Duchess, 'chop off her head!' Alice glanced rather anxiously at the cook, to see if she meant to take the hint; but the cook was busily stirring the soup, and seemed not to be listening, so she went on again: 'Twenty-four hours, I THINK; or is it twelve? I--' 'Oh, don't bother ME,' said the Duchess; 'I never could abide figures!' And with that she began nursing her child again, singing a sort of lullaby to it as she did so, and giving it a violent shake at the end of every line: 'Speak roughly to your little boy, And beat him when he sneezes: He only does it to annoy, Because he knows it teases.' CHORUS. (In which the cook and the baby joined):-- 'Wow! wow! wow!' While the Duchess sang the second verse of the song, she kept tossing the baby violently up and down, and the poor little thing howled so, that Alice could hardly hear the words:-- 'I speak severely to my boy, I beat him when he sneezes; For he can thoroughly enjoy The pepper when he pleases!' CHORUS. 'Wow! wow! wow!' 'Here! you may nurse it a bit, if you like!' the Duchess said to Alice, flinging the baby at her as she spoke. 'I must go and get ready to play croquet with the Queen,' and she hurried out of the room. The cook threw a frying-pan after her as she went out, but it just missed her. Alice caught the baby with some difficulty, as it was a queer-shaped little creature, and held out its arms and legs in all directions, 'just like a star-fish,' thought Alice. The poor little thing was snorting like a steam-engine when she caught it, and kept doubling itself up and straightening itself out again, so that altogether, for the first minute or two, it was as much as she could do to hold it. As soon as she had made out the proper way of nursing it, (which was to twist it up into a sort of knot, and then keep tight hold of its right ear and left foot, so as to prevent its undoing itself,) she carried it out into the open air. 'IF I don't take this child away with me,' thought Alice, 'they're sure to kill it in a day or two: wouldn't it be murder to leave it behind?' She said the last words out loud, and the little thing grunted in reply (it had left off sneezing by this time). 'Don't grunt,' said Alice; 'that's not at all a proper way of expressing yourself.' The baby grunted again, and Alice looked very anxiously into its face to see what was the matter with it. There could be no doubt that it had a VERY turn-up nose, much more like a snout than a real nose; also its eyes were getting extremely small for a baby: altogether Alice did not like the look of the thing at all. 'But perhaps it was only sobbing,' she thought, and looked into its eyes again, to see if there were any tears. No, there were no tears. 'If you're going to turn into a pig, my dear,' said Alice, seriously, 'I'll have nothing more to do with you. Mind now!' The poor little thing sobbed again (or grunted, it was impossible to say which), and they went on for some while in silence. 
Alice was just beginning to think to herself, 'Now, what am I to do with this creature when I get it home?' when it grunted again, so violently, that she looked down into its face in some alarm. This time there could be NO mistake about it: it was neither more nor less than a pig, and she felt that it would be quite absurd for her to carry it further. So she set the little creature down, and felt quite relieved to see it trot away quietly into the wood. 'If it had grown up,' she said to herself, 'it would have made a dreadfully ugly child: but it makes rather a handsome pig, I think.' And she began thinking over other children she knew, who might do very well as pigs, and was just saying to herself, 'if one only knew the right way to change them--' when she was a little startled by seeing the Cheshire Cat sitting on a bough of a tree a few yards off. The Cat only grinned when it saw Alice. It looked good-natured, she thought: still it had VERY long claws and a great many teeth, so she felt that it ought to be treated with respect. 'Cheshire Puss,' she began, rather timidly, as she did not at all know whether it would like the name: however, it only grinned a little wider. 'Come, it's pleased so far,' thought Alice, and she went on. 'Would you tell me, please, which way I ought to go from here?' 'That depends a good deal on where you want to get to,' said the Cat. 'I don't much care where--' said Alice. 'Then it doesn't matter which way you go,' said the Cat. '--so long as I get SOMEWHERE,' Alice added as an explanation. 'Oh, you're sure to do that,' said the Cat, 'if you only walk long enough.' Alice felt that this could not be denied, so she tried another question. 'What sort of people live about here?' 'In THAT direction,' the Cat said, waving its right paw round, 'lives a Hatter: and in THAT direction,' waving the other paw, 'lives a March Hare. Visit either you like: they're both mad.' 'But I don't want to go among mad people,' Alice remarked. 'Oh, you can't help that,' said the Cat: 'we're all mad here. I'm mad. You're mad.' 'How do you know I'm mad?' said Alice. 'You must be,' said the Cat, 'or you wouldn't have come here.' Alice didn't think that proved it at all; however, she went on 'And how do you know that you're mad?' 'To begin with,' said the Cat, 'a dog's not mad. You grant that?' 'I suppose so,' said Alice. 'Well, then,' the Cat went on, 'you see, a dog growls when it's angry, and wags its tail when it's pleased. Now I growl when I'm pleased, and wag my tail when I'm angry. Therefore I'm mad.' 'I call it purring, not growling,' said Alice. 'Call it what you like,' said the Cat. 'Do you play croquet with the Queen to-day?' 'I should like it very much,' said Alice, 'but I haven't been invited yet.' 'You'll see me there,' said the Cat, and vanished. Alice was not much surprised at this, she was getting so used to queer things happening. While she was looking at the place where it had been, it suddenly appeared again. 'By-the-bye, what became of the baby?' said the Cat. 'I'd nearly forgotten to ask.' 'It turned into a pig,' Alice quietly said, just as if it had come back in a natural way. 'I thought it would,' said the Cat, and vanished again. Alice waited a little, half expecting to see it again, but it did not appear, and after a minute or two she walked on in the direction in which the March Hare was said to live. 
'I've seen hatters before,' she said to herself; 'the March Hare will be much the most interesting, and perhaps as this is May it won't be raving mad--at least not so mad as it was in March.' As she said this, she looked up, and there was the Cat again, sitting on a branch of a tree. 'Did you say pig, or fig?' said the Cat. 'I said pig,' replied Alice; 'and I wish you wouldn't keep appearing and vanishing so suddenly: you make one quite giddy.' 'All right,' said the Cat; and this time it vanished quite slowly, beginning with the end of the tail, and ending with the grin, which remained some time after the rest of it had gone. 'Well! I've often seen a cat without a grin,' thought Alice; 'but a grin without a cat! It's the most curious thing I ever saw in my life!' She had not gone much farther before she came in sight of the house of the March Hare: she thought it must be the right house, because the chimneys were shaped like ears and the roof was thatched with fur. It was so large a house, that she did not like to go nearer till she had nibbled some more of the lefthand bit of mushroom, and raised herself to about two feet high: even then she walked up towards it rather timidly, saying to herself 'Suppose it should be raving mad after all! I almost wish I'd gone to see the Hatter instead!' CHAPTER VII. A Mad Tea-Party There was a table set out under a tree in front of the house, and the March Hare and the Hatter were having tea at it: a Dormouse was sitting between them, fast asleep, and the other two were using it as a cushion, resting their elbows on it, and talking over its head. 'Very uncomfortable for the Dormouse,' thought Alice; 'only, as it's asleep, I suppose it doesn't mind.' The table was a large one, but the three were all crowded together at one corner of it: 'No room! No room!' they cried out when they saw Alice coming. 'There's PLENTY of room!' said Alice indignantly, and she sat down in a large arm-chair at one end of the table. 'Have some wine,' the March Hare said in an encouraging tone. Alice looked all round the table, but there was nothing on it but tea. 'I don't see any wine,' she remarked. 'There isn't any,' said the March Hare. 'Then it wasn't very civil of you to offer it,' said Alice angrily. 'It wasn't very civil of you to sit down without being invited,' said the March Hare. 'I didn't know it was YOUR table,' said Alice; 'it's laid for a great many more than three.' 'Your hair wants cutting,' said the Hatter. He had been looking at Alice for some time with great curiosity, and this was his first speech. 'You should learn not to make personal remarks,' Alice said with some severity; 'it's very rude.' The Hatter opened his eyes very wide on hearing this; but all he SAID was, 'Why is a raven like a writing-desk?' 'Come, we shall have some fun now!' thought Alice. 'I'm glad they've begun asking riddles.--I believe I can guess that,' she added aloud. 'Do you mean that you think you can find out the answer to it?' said the March Hare. 'Exactly so,' said Alice. 'Then you should say what you mean,' the March Hare went on. 'I do,' Alice hastily replied; 'at least--at least I mean what I say--that's the same thing, you know.' 'Not the same thing a bit!' said the Hatter. 'You might just as well say that "I see what I eat" is the same thing as "I eat what I see"!' 'You might just as well say,' added the March Hare, 'that "I like what I get" is the same thing as "I get what I like"!' 
'You might just as well say,' added the Dormouse, who seemed to be talking in his sleep, 'that "I breathe when I sleep" is the same thing as "I sleep when I breathe"!' 'It IS the same thing with you,' said the Hatter, and here the conversation dropped, and the party sat silent for a minute, while Alice thought over all she could remember about ravens and writing-desks, which wasn't much. The Hatter was the first to break the silence. 'What day of the month is it?' he said, turning to Alice: he had taken his watch out of his pocket, and was looking at it uneasily, shaking it every now and then, and holding it to his ear. Alice considered a little, and then said 'The fourth.' 'Two days wrong!' sighed the Hatter. 'I told you butter wouldn't suit the works!' he added looking angrily at the March Hare. 'It was the BEST butter,' the March Hare meekly replied. 'Yes, but some crumbs must have got in as well,' the Hatter grumbled: 'you shouldn't have put it in with the bread-knife.' The March Hare took the watch and looked at it gloomily: then he dipped it into his cup of tea, and looked at it again: but he could think of nothing better to say than his first remark, 'It was the BEST butter, you know.' Alice had been looking over his shoulder with some curiosity. 'What a funny watch!' she remarked. 'It tells the day of the month, and doesn't tell what o'clock it is!' 'Why should it?' muttered the Hatter. 'Does YOUR watch tell you what year it is?' 'Of course not,' Alice replied very readily: 'but that's because it stays the same year for such a long time together.' 'Which is just the case with MINE,' said the Hatter. Alice felt dreadfully puzzled. The Hatter's remark seemed to have no sort of meaning in it, and yet it was certainly English. 'I don't quite understand you,' she said, as politely as she could. 'The Dormouse is asleep again,' said the Hatter, and he poured a little hot tea upon its nose. The Dormouse shook its head impatiently, and said, without opening its eyes, 'Of course, of course; just what I was going to remark myself.' 'Have you guessed the riddle yet?' the Hatter said, turning to Alice again. 'No, I give it up,' Alice replied: 'what's the answer?' 'I haven't the slightest idea,' said the Hatter. 'Nor I,' said the March Hare. Alice sighed wearily. 'I think you might do something better with the time,' she said, 'than waste it in asking riddles that have no answers.' 'If you knew Time as well as I do,' said the Hatter, 'you wouldn't talk about wasting IT. It's HIM.' 'I don't know what you mean,' said Alice. 'Of course you don't!' the Hatter said, tossing his head contemptuously. 'I dare say you never even spoke to Time!' 'Perhaps not,' Alice cautiously replied: 'but I know I have to beat time when I learn music.' 'Ah! that accounts for it,' said the Hatter. 'He won't stand beating. Now, if you only kept on good terms with him, he'd do almost anything you liked with the clock. For instance, suppose it were nine o'clock in the morning, just time to begin lessons: you'd only have to whisper a hint to Time, and round goes the clock in a twinkling! Half-past one, time for dinner!' ('I only wish it was,' the March Hare said to itself in a whisper.) 'That would be grand, certainly,' said Alice thoughtfully: 'but then--I shouldn't be hungry for it, you know.' 'Not at first, perhaps,' said the Hatter: 'but you could keep it to half-past one as long as you liked.' 'Is that the way YOU manage?' Alice asked. The Hatter shook his head mournfully. 'Not I!' he replied. 
'We quarrelled last March--just before HE went mad, you know--' (pointing with his tea spoon at the March Hare,) '--it was at the great concert given by the Queen of Hearts, and I had to sing "Twinkle, twinkle, little bat! How I wonder what you're at!" You know the song, perhaps?' 'I've heard something like it,' said Alice. 'It goes on, you know,' the Hatter continued, 'in this way:-- "Up above the world you fly, Like a tea-tray in the sky. Twinkle, twinkle--"' Here the Dormouse shook itself, and began singing in its sleep 'Twinkle, twinkle, twinkle, twinkle--' and went on so long that they had to pinch it to make it stop. 'Well, I'd hardly finished the first verse,' said the Hatter, 'when the Queen jumped up and bawled out, "He's murdering the time! Off with his head!"' 'How dreadfully savage!' exclaimed Alice. 'And ever since that,' the Hatter went on in a mournful tone, 'he won't do a thing I ask! It's always six o'clock now.' A bright idea came into Alice's head. 'Is that the reason so many tea-things are put out here?' she asked. 'Yes, that's it,' said the Hatter with a sigh: 'it's always tea-time, and we've no time to wash the things between whiles.' 'Then you keep moving round, I suppose?' said Alice. 'Exactly so,' said the Hatter: 'as the things get used up.' 'But what happens when you come to the beginning again?' Alice ventured to ask. 'Suppose we change the subject,' the March Hare interrupted, yawning. 'I'm getting tired of this. I vote the young lady tells us a story.' 'I'm afraid I don't know one,' said Alice, rather alarmed at the proposal. 'Then the Dormouse shall!' they both cried. 'Wake up, Dormouse!' And they pinched it on both sides at once. The Dormouse slowly opened his eyes. 'I wasn't asleep,' he said in a hoarse, feeble voice: 'I heard every word you fellows were saying.' 'Tell us a story!' said the March Hare. 'Yes, please do!' pleaded Alice. 'And be quick about it,' added the Hatter, 'or you'll be asleep again before it's done.' 'Once upon a time there were three little sisters,' the Dormouse began in a great hurry; 'and their names were Elsie, Lacie, and Tillie; and they lived at the bottom of a well--' 'What did they live on?' said Alice, who always took a great interest in questions of eating and drinking. 'They lived on treacle,' said the Dormouse, after thinking a minute or two. 'They couldn't have done that, you know,' Alice gently remarked; 'they'd have been ill.' 'So they were,' said the Dormouse; 'VERY ill.' Alice tried to fancy to herself what such an extraordinary ways of living would be like, but it puzzled her too much, so she went on: 'But why did they live at the bottom of a well?' 'Take some more tea,' the March Hare said to Alice, very earnestly. 'I've had nothing yet,' Alice replied in an offended tone, 'so I can't take more.' 'You mean you can't take LESS,' said the Hatter: 'it's very easy to take MORE than nothing.' 'Nobody asked YOUR opinion,' said Alice. 'Who's making personal remarks now?' the Hatter asked triumphantly. Alice did not quite know what to say to this: so she helped herself to some tea and bread-and-butter, and then turned to the Dormouse, and repeated her question. 'Why did they live at the bottom of a well?' The Dormouse again took a minute or two to think about it, and then said, 'It was a treacle-well.' 'There's no such thing!' Alice was beginning very angrily, but the Hatter and the March Hare went 'Sh! sh!' and the Dormouse sulkily remarked, 'If you can't be civil, you'd better finish the story for yourself.' 
'No, please go on!' Alice said very humbly; 'I won't interrupt again. I dare say there may be ONE.' 'One, indeed!' said the Dormouse indignantly. However, he consented to

================================================
FILE: examples/input/alice_2.txt
================================================

go on. 'And so these three little sisters--they were learning to draw, you know--' 'What did they draw?' said Alice, quite forgetting her promise. 'Treacle,' said the Dormouse, without considering at all this time. 'I want a clean cup,' interrupted the Hatter: 'let's all move one place on.' He moved on as he spoke, and the Dormouse followed him: the March Hare moved into the Dormouse's place, and Alice rather unwillingly took the place of the March Hare. The Hatter was the only one who got any advantage from the change: and Alice was a good deal worse off than before, as the March Hare had just upset the milk-jug into his plate. Alice did not wish to offend the Dormouse again, so she began very cautiously: 'But I don't understand. Where did they draw the treacle from?' 'You can draw water out of a water-well,' said the Hatter; 'so I should think you could draw treacle out of a treacle-well--eh, stupid?' 'But they were IN the well,' Alice said to the Dormouse, not choosing to notice this last remark. 'Of course they were', said the Dormouse; '--well in.' This answer so confused poor Alice, that she let the Dormouse go on for some time without interrupting it. 'They were learning to draw,' the Dormouse went on, yawning and rubbing its eyes, for it was getting very sleepy; 'and they drew all manner of things--everything that begins with an M--' 'Why with an M?' said Alice. 'Why not?' said the March Hare. Alice was silent. The Dormouse had closed its eyes by this time, and was going off into a doze; but, on being pinched by the Hatter, it woke up again with a little shriek, and went on: '--that begins with an M, such as mouse-traps, and the moon, and memory, and muchness--you know you say things are "much of a muchness"--did you ever see such a thing as a drawing of a muchness?' 'Really, now you ask me,' said Alice, very much confused, 'I don't think--' 'Then you shouldn't talk,' said the Hatter. This piece of rudeness was more than Alice could bear: she got up in great disgust, and walked off; the Dormouse fell asleep instantly, and neither of the others took the least notice of her going, though she looked back once or twice, half hoping that they would call after her: the last time she saw them, they were trying to put the Dormouse into the teapot. 'At any rate I'll never go THERE again!' said Alice as she picked her way through the wood. 'It's the stupidest tea-party I ever was at in all my life!' Just as she said this, she noticed that one of the trees had a door leading right into it. 'That's very curious!' she thought. 'But everything's curious today. I think I may as well go in at once.' And in she went. Once more she found herself in the long hall, and close to the little glass table. 'Now, I'll manage better this time,' she said to herself, and began by taking the little golden key, and unlocking the door that led into the garden. Then she went to work nibbling at the mushroom (she had kept a piece of it in her pocket) till she was about a foot high: then she walked down the little passage: and THEN--she found herself at last in the beautiful garden, among the bright flower-beds and the cool fountains. CHAPTER VIII.
The Queen's Croquet-Ground A large rose-tree stood near the entrance of the garden: the roses growing on it were white, but there were three gardeners at it, busily painting them red. Alice thought this a very curious thing, and she went nearer to watch them, and just as she came up to them she heard one of them say, 'Look out now, Five! Don't go splashing paint over me like that!' 'I couldn't help it,' said Five, in a sulky tone; 'Seven jogged my elbow.' On which Seven looked up and said, 'That's right, Five! Always lay the blame on others!' 'YOU'D better not talk!' said Five. 'I heard the Queen say only yesterday you deserved to be beheaded!' 'What for?' said the one who had spoken first. 'That's none of YOUR business, Two!' said Seven. 'Yes, it IS his business!' said Five, 'and I'll tell him--it was for bringing the cook tulip-roots instead of onions.' Seven flung down his brush, and had just begun 'Well, of all the unjust things--' when his eye chanced to fall upon Alice, as she stood watching them, and he checked himself suddenly: the others looked round also, and all of them bowed low. 'Would you tell me,' said Alice, a little timidly, 'why you are painting those roses?' Five and Seven said nothing, but looked at Two. Two began in a low voice, 'Why the fact is, you see, Miss, this here ought to have been a RED rose-tree, and we put a white one in by mistake; and if the Queen was to find it out, we should all have our heads cut off, you know. So you see, Miss, we're doing our best, afore she comes, to--' At this moment Five, who had been anxiously looking across the garden, called out 'The Queen! The Queen!' and the three gardeners instantly threw themselves flat upon their faces. There was a sound of many footsteps, and Alice looked round, eager to see the Queen. First came ten soldiers carrying clubs; these were all shaped like the three gardeners, oblong and flat, with their hands and feet at the corners: next the ten courtiers; these were ornamented all over with diamonds, and walked two and two, as the soldiers did. After these came the royal children; there were ten of them, and the little dears came jumping merrily along hand in hand, in couples: they were all ornamented with hearts. Next came the guests, mostly Kings and Queens, and among them Alice recognised the White Rabbit: it was talking in a hurried nervous manner, smiling at everything that was said, and went by without noticing her. Then followed the Knave of Hearts, carrying the King's crown on a crimson velvet cushion; and, last of all this grand procession, came THE KING AND QUEEN OF HEARTS. Alice was rather doubtful whether she ought not to lie down on her face like the three gardeners, but she could not remember ever having heard of such a rule at processions; 'and besides, what would be the use of a procession,' thought she, 'if people had all to lie down upon their faces, so that they couldn't see it?' So she stood still where she was, and waited. When the procession came opposite to Alice, they all stopped and looked at her, and the Queen said severely 'Who is this?' She said it to the Knave of Hearts, who only bowed and smiled in reply. 'Idiot!' said the Queen, tossing her head impatiently; and, turning to Alice, she went on, 'What's your name, child?' 'My name is Alice, so please your Majesty,' said Alice very politely; but she added, to herself, 'Why, they're only a pack of cards, after all. I needn't be afraid of them!' 'And who are THESE?' 
said the Queen, pointing to the three gardeners who were lying round the rosetree; for, you see, as they were lying on their faces, and the pattern on their backs was the same as the rest of the pack, she could not tell whether they were gardeners, or soldiers, or courtiers, or three of her own children. 'How should I know?' said Alice, surprised at her own courage. 'It's no business of MINE.' The Queen turned crimson with fury, and, after glaring at her for a moment like a wild beast, screamed 'Off with her head! Off--' 'Nonsense!' said Alice, very loudly and decidedly, and the Queen was silent. The King laid his hand upon her arm, and timidly said 'Consider, my dear: she is only a child!' The Queen turned angrily away from him, and said to the Knave 'Turn them over!' The Knave did so, very carefully, with one foot. 'Get up!' said the Queen, in a shrill, loud voice, and the three gardeners instantly jumped up, and began bowing to the King, the Queen, the royal children, and everybody else. 'Leave off that!' screamed the Queen. 'You make me giddy.' And then, turning to the rose-tree, she went on, 'What HAVE you been doing here?' 'May it please your Majesty,' said Two, in a very humble tone, going down on one knee as he spoke, 'we were trying--' 'I see!' said the Queen, who had meanwhile been examining the roses. 'Off with their heads!' and the procession moved on, three of the soldiers remaining behind to execute the unfortunate gardeners, who ran to Alice for protection. 'You shan't be beheaded!' said Alice, and she put them into a large flower-pot that stood near. The three soldiers wandered about for a minute or two, looking for them, and then quietly marched off after the others. 'Are their heads off?' shouted the Queen. 'Their heads are gone, if it please your Majesty!' the soldiers shouted in reply. 'That's right!' shouted the Queen. 'Can you play croquet?' The soldiers were silent, and looked at Alice, as the question was evidently meant for her. 'Yes!' shouted Alice. 'Come on, then!' roared the Queen, and Alice joined the procession, wondering very much what would happen next. 'It's--it's a very fine day!' said a timid voice at her side. She was walking by the White Rabbit, who was peeping anxiously into her face. 'Very,' said Alice: '--where's the Duchess?' 'Hush! Hush!' said the Rabbit in a low, hurried tone. He looked anxiously over his shoulder as he spoke, and then raised himself upon tiptoe, put his mouth close to her ear, and whispered 'She's under sentence of execution.' 'What for?' said Alice. 'Did you say "What a pity!"?' the Rabbit asked. 'No, I didn't,' said Alice: 'I don't think it's at all a pity. I said "What for?"' 'She boxed the Queen's ears--' the Rabbit began. Alice gave a little scream of laughter. 'Oh, hush!' the Rabbit whispered in a frightened tone. 'The Queen will hear you! You see, she came rather late, and the Queen said--' 'Get to your places!' shouted the Queen in a voice of thunder, and people began running about in all directions, tumbling up against each other; however, they got settled down in a minute or two, and the game began. Alice thought she had never seen such a curious croquet-ground in her life; it was all ridges and furrows; the balls were live hedgehogs, the mallets live flamingoes, and the soldiers had to double themselves up and to stand on their hands and feet, to make the arches. 
The chief difficulty Alice found at first was in managing her flamingo: she succeeded in getting its body tucked away, comfortably enough, under her arm, with its legs hanging down, but generally, just as she had got its neck nicely straightened out, and was going to give the hedgehog a blow with its head, it WOULD twist itself round and look up in her face, with such a puzzled expression that she could not help bursting out laughing: and when she had got its head down, and was going to begin again, it was very provoking to find that the hedgehog had unrolled itself, and was in the act of crawling away: besides all this, there was generally a ridge or furrow in the way wherever she wanted to send the hedgehog to, and, as the doubled-up soldiers were always getting up and walking off to other parts of the ground, Alice soon came to the conclusion that it was a very difficult game indeed. The players all played at once without waiting for turns, quarrelling all the while, and fighting for the hedgehogs; and in a very short time the Queen was in a furious passion, and went stamping about, and shouting 'Off with his head!' or 'Off with her head!' about once in a minute. Alice began to feel very uneasy: to be sure, she had not as yet had any dispute with the Queen, but she knew that it might happen any minute, 'and then,' thought she, 'what would become of me? They're dreadfully fond of beheading people here; the great wonder is, that there's any one left alive!' She was looking about for some way of escape, and wondering whether she could get away without being seen, when she noticed a curious appearance in the air: it puzzled her very much at first, but, after watching it a minute or two, she made it out to be a grin, and she said to herself 'It's the Cheshire Cat: now I shall have somebody to talk to.' 'How are you getting on?' said the Cat, as soon as there was mouth enough for it to speak with. Alice waited till the eyes appeared, and then nodded. 'It's no use speaking to it,' she thought, 'till its ears have come, or at least one of them.' In another minute the whole head appeared, and then Alice put down her flamingo, and began an account of the game, feeling very glad she had someone to listen to her. The Cat seemed to think that there was enough of it now in sight, and no more of it appeared. 'I don't think they play at all fairly,' Alice began, in rather a complaining tone, 'and they all quarrel so dreadfully one can't hear oneself speak--and they don't seem to have any rules in particular; at least, if there are, nobody attends to them--and you've no idea how confusing it is all the things being alive; for instance, there's the arch I've got to go through next walking about at the other end of the ground--and I should have croqueted the Queen's hedgehog just now, only it ran away when it saw mine coming!' 'How do you like the Queen?' said the Cat in a low voice. 'Not at all,' said Alice: 'she's so extremely--' Just then she noticed that the Queen was close behind her, listening: so she went on, '--likely to win, that it's hardly worth while finishing the game.' The Queen smiled and passed on. 'Who ARE you talking to?' said the King, going up to Alice, and looking at the Cat's head with great curiosity. 'It's a friend of mine--a Cheshire Cat,' said Alice: 'allow me to introduce it.' 'I don't like the look of it at all,' said the King: 'however, it may kiss my hand if it likes.' 'I'd rather not,' the Cat remarked. 'Don't be impertinent,' said the King, 'and don't look at me like that!' 
He got behind Alice as he spoke. 'A cat may look at a king,' said Alice. 'I've read that in some book, but I don't remember where.' 'Well, it must be removed,' said the King very decidedly, and he called the Queen, who was passing at the moment, 'My dear! I wish you would have this cat removed!' The Queen had only one way of settling all difficulties, great or small. 'Off with his head!' she said, without even looking round. 'I'll fetch the executioner myself,' said the King eagerly, and he hurried off. Alice thought she might as well go back, and see how the game was going on, as she heard the Queen's voice in the distance, screaming with passion. She had already heard her sentence three of the players to be executed for having missed their turns, and she did not like the look of things at all, as the game was in such confusion that she never knew whether it was her turn or not. So she went in search of her hedgehog. The hedgehog was engaged in a fight with another hedgehog, which seemed to Alice an excellent opportunity for croqueting one of them with the other: the only difficulty was, that her flamingo was gone across to the other side of the garden, where Alice could see it trying in a helpless sort of way to fly up into a tree. By the time she had caught the flamingo and brought it back, the fight was over, and both the hedgehogs were out of sight: 'but it doesn't matter much,' thought Alice, 'as all the arches are gone from this side of the ground.' So she tucked it away under her arm, that it might not escape again, and went back for a little more conversation with her friend. When she got back to the Cheshire Cat, she was surprised to find quite a large crowd collected round it: there was a dispute going on between the executioner, the King, and the Queen, who were all talking at once, while all the rest were quite silent, and looked very uncomfortable. The moment Alice appeared, she was appealed to by all three to settle the question, and they repeated their arguments to her, though, as they all spoke at once, she found it very hard indeed to make out exactly what they said. The executioner's argument was, that you couldn't cut off a head unless there was a body to cut it off from: that he had never had to do such a thing before, and he wasn't going to begin at HIS time of life. The King's argument was, that anything that had a head could be beheaded, and that you weren't to talk nonsense. The Queen's argument was, that if something wasn't done about it in less than no time she'd have everybody executed, all round. (It was this last remark that had made the whole party look so grave and anxious.) Alice could think of nothing else to say but 'It belongs to the Duchess: you'd better ask HER about it.' 'She's in prison,' the Queen said to the executioner: 'fetch her here.' And the executioner went off like an arrow. The Cat's head began fading away the moment he was gone, and, by the time he had come back with the Duchess, it had entirely disappeared; so the King and the executioner ran wildly up and down looking for it, while the rest of the party went back to the game. CHAPTER IX. The Mock Turtle's Story 'You can't think how glad I am to see you again, you dear old thing!' said the Duchess, as she tucked her arm affectionately into Alice's, and they walked off together. Alice was very glad to find her in such a pleasant temper, and thought to herself that perhaps it was only the pepper that had made her so savage when they met in the kitchen. 
'When I'M a Duchess,' she said to herself, (not in a very hopeful tone though), 'I won't have any pepper in my kitchen AT ALL. Soup does very well without--Maybe it's always pepper that makes people hot-tempered,' she went on, very much pleased at having found out a new kind of rule, 'and vinegar that makes them sour--and camomile that makes them bitter--and--and barley-sugar and such things that make children sweet-tempered. I only wish people knew that: then they wouldn't be so stingy about it, you know--' She had quite forgotten the Duchess by this time, and was a little startled when she heard her voice close to her ear. 'You're thinking about something, my dear, and that makes you forget to talk. I can't tell you just now what the moral of that is, but I shall remember it in a bit.' 'Perhaps it hasn't one,' Alice ventured to remark. 'Tut, tut, child!' said the Duchess. 'Everything's got a moral, if only you can find it.' And she squeezed herself up closer to Alice's side as she spoke. Alice did not much like keeping so close to her: first, because the Duchess was VERY ugly; and secondly, because she was exactly the right height to rest her chin upon Alice's shoulder, and it was an uncomfortably sharp chin. However, she did not like to be rude, so she bore it as well as she could. 'The game's going on rather better now,' she said, by way of keeping up the conversation a little. ''Tis so,' said the Duchess: 'and the moral of that is--"Oh, 'tis love, 'tis love, that makes the world go round!"' 'Somebody said,' Alice whispered, 'that it's done by everybody minding their own business!' 'Ah, well! It means much the same thing,' said the Duchess, digging her sharp little chin into Alice's shoulder as she added, 'and the moral of THAT is--"Take care of the sense, and the sounds will take care of themselves."' 'How fond she is of finding morals in things!' Alice thought to herself. 'I dare say you're wondering why I don't put my arm round your waist,' the Duchess said after a pause: 'the reason is, that I'm doubtful about the temper of your flamingo. Shall I try the experiment?' 'HE might bite,' Alice cautiously replied, not feeling at all anxious to have the experiment tried. 'Very true,' said the Duchess: 'flamingoes and mustard both bite. And the moral of that is--"Birds of a feather flock together."' 'Only mustard isn't a bird,' Alice remarked. 'Right, as usual,' said the Duchess: 'what a clear way you have of putting things!' 'It's a mineral, I THINK,' said Alice. 'Of course it is,' said the Duchess, who seemed ready to agree to everything that Alice said; 'there's a large mustard-mine near here. And the moral of that is--"The more there is of mine, the less there is of yours."' 'Oh, I know!' exclaimed Alice, who had not attended to this last remark, 'it's a vegetable. It doesn't look like one, but it is.' 'I quite agree with you,' said the Duchess; 'and the moral of that is--"Be what you would seem to be"--or if you'd like it put more simply--"Never imagine yourself not to be otherwise than what it might appear to others that what you were or might have been was not otherwise than what you had been would have appeared to them to be otherwise."' 'I think I should understand that better,' Alice said very politely, 'if I had it written down: but I can't quite follow it as you say it.' 'That's nothing to what I could say if I chose,' the Duchess replied, in a pleased tone. 'Pray don't trouble yourself to say it any longer than that,' said Alice. 'Oh, don't talk about trouble!' 
said the Duchess. 'I make you a present of everything I've said as yet.' 'A cheap sort of present!' thought Alice. 'I'm glad they don't give birthday presents like that!' But she did not venture to say it out loud. 'Thinking again?' the Duchess asked, with another dig of her sharp little chin. 'I've a right to think,' said Alice sharply, for she was beginning to feel a little worried. 'Just about as much right,' said the Duchess, 'as pigs have to fly; and the m--' But here, to Alice's great surprise, the Duchess's voice died away, even in the middle of her favourite word 'moral,' and the arm that was linked into hers began to tremble. Alice looked up, and there stood the Queen in front of them, with her arms folded, frowning like a thunderstorm. 'A fine day, your Majesty!' the Duchess began in a low, weak voice. 'Now, I give you fair warning,' shouted the Queen, stamping on the ground as she spoke; 'either you or your head must be off, and that in about half no time! Take your choice!' The Duchess took her choice, and was gone in a moment. 'Let's go on with the game,' the Queen said to Alice; and Alice was too much frightened to say a word, but slowly followed her back to the croquet-ground. The other guests had taken advantage of the Queen's absence, and were resting in the shade: however, the moment they saw her, they hurried back to the game, the Queen merely remarking that a moment's delay would cost them their lives. All the time they were playing the Queen never left off quarrelling with the other players, and shouting 'Off with his head!' or 'Off with her head!' Those whom she sentenced were taken into custody by the soldiers, who of course had to leave off being arches to do this, so that by the end of half an hour or so there were no arches left, and all the players, except the King, the Queen, and Alice, were in custody and under sentence of execution. Then the Queen left off, quite out of breath, and said to Alice, 'Have you seen the Mock Turtle yet?' 'No,' said Alice. 'I don't even know what a Mock Turtle is.' 'It's the thing Mock Turtle Soup is made from,' said the Queen. 'I never saw one, or heard of one,' said Alice. 'Come on, then,' said the Queen, 'and he shall tell you his history,' As they walked off together, Alice heard the King say in a low voice, to the company generally, 'You are all pardoned.' 'Come, THAT'S a good thing!' she said to herself, for she had felt quite unhappy at the number of executions the Queen had ordered. They very soon came upon a Gryphon, lying fast asleep in the sun. (IF you don't know what a Gryphon is, look at the picture.) 'Up, lazy thing!' said the Queen, 'and take this young lady to see the Mock Turtle, and to hear his history. I must go back and see after some executions I have ordered'; and she walked off, leaving Alice alone with the Gryphon. Alice did not quite like the look of the creature, but on the whole she thought it would be quite as safe to stay with it as to go after that savage Queen: so she waited. The Gryphon sat up and rubbed its eyes: then it watched the Queen till she was out of sight: then it chuckled. 'What fun!' said the Gryphon, half to itself, half to Alice. 'What IS the fun?' said Alice. 'Why, SHE,' said the Gryphon. 'It's all her fancy, that: they never executes nobody, you know. Come on!' 'Everybody says "come on!" here,' thought Alice, as she went slowly after it: 'I never was so ordered about in all my life, never!' 
They had not gone far before they saw the Mock Turtle in the distance, sitting sad and lonely on a little ledge of rock, and, as they came nearer, Alice could hear him sighing as if his heart would break. She pitied him deeply. 'What is his sorrow?' she asked the Gryphon, and the Gryphon answered, very nearly in the same words as before, 'It's all his fancy, that: he hasn't got no sorrow, you know. Come on!' So they went up to the Mock Turtle, who looked at them with large eyes full of tears, but said nothing. 'This here young lady,' said the Gryphon, 'she wants for to know your history, she do.' 'I'll tell it her,' said the Mock Turtle in a deep, hollow tone: 'sit down, both of you, and don't speak a word till I've finished.' So they sat down, and nobody spoke for some minutes. Alice thought to herself, 'I don't see how he can EVEN finish, if he doesn't begin.' But she waited patiently. 'Once,' said the Mock Turtle at last, with a deep sigh, 'I was a real Turtle.' These words were followed by a very long silence, broken only by an occasional exclamation of 'Hjckrrh!' from the Gryphon, and the constant heavy sobbing of the Mock Turtle. Alice was very nearly getting up and saying, 'Thank you, sir, for your interesting story,' but she could not help thinking there MUST be more to come, so she sat still and said nothing. 'When we were little,' the Mock Turtle went on at last, more calmly, though still sobbing a little now and then, 'we went to school in the sea. The master was an old Turtle--we used to call him Tortoise--' 'Why did you call him Tortoise, if he wasn't one?' Alice asked. 'We called him Tortoise because he taught us,' said the Mock Turtle angrily: 'really you are very dull!' 'You ought to be ashamed of yourself for asking such a simple question,' added the Gryphon; and then they both sat silent and looked at poor Alice, who felt ready to sink into the earth. At last the Gryphon said to the Mock Turtle, 'Drive on, old fellow! Don't be all day about it!' and he went on in these words: 'Yes, we went to school in the sea, though you mayn't believe it--' 'I never said I didn't!' interrupted Alice. 'You did,' said the Mock Turtle. 'Hold your tongue!' added the Gryphon, before Alice could speak again. The Mock Turtle went on. 'We had the best of educations--in fact, we went to school every day--' 'I'VE been to a day-school, too,' said Alice; 'you needn't be so proud as all that.' 'With extras?' asked the Mock Turtle a little anxiously. 'Yes,' said Alice, 'we learned French and music.' 'And washing?' said the Mock Turtle. 'Certainly not!' said Alice indignantly. 'Ah! then yours wasn't a really good school,' said the Mock Turtle in a tone of great relief. 'Now at OURS they had at the end of the bill, "French, music, AND WASHING--extra."' 'You couldn't have wanted it much,' said Alice; 'living at the bottom of the sea.' 'I couldn't afford to learn it.' said the Mock Turtle with a sigh. 'I only took the regular course.' 'What was that?' inquired Alice. 'Reeling and Writhing, of course, to begin with,' the Mock Turtle replied; 'and then the different branches of Arithmetic--Ambition, Distraction, Uglification, and Derision.' 'I never heard of "Uglification,"' Alice ventured to say. 'What is it?' The Gryphon lifted up both its paws in surprise. 'What! Never heard of uglifying!' it exclaimed. 'You know what to beautify is, I suppose?' 'Yes,' said Alice doubtfully: 'it means--to--make--anything--prettier.' 
'Well, then,' the Gryphon went on, 'if you don't know what to uglify is, you ARE a simpleton.' Alice did not feel encouraged to ask any more questions about it, so she turned to the Mock Turtle, and said 'What else had you to learn?' 'Well, there was Mystery,' the Mock Turtle replied, counting off the subjects on his flappers, '--Mystery, ancient and modern, with Seaography: then Drawling--the Drawling-master was an old conger-eel, that used to come once a week: HE taught us Drawling, Stretching, and Fainting in Coils.' 'What was THAT like?' said Alice. 'Well, I can't show it you myself,' the Mock Turtle said: 'I'm too stiff. And the Gryphon never learnt it.' 'Hadn't time,' said the Gryphon: 'I went to the Classics master, though. He was an old crab, HE was.' 'I never went to him,' the Mock Turtle said with a sigh: 'he taught Laughing and Grief, they used to say.' 'So he did, so he did,' said the Gryphon, sighing in his turn; and both creatures hid their faces in their paws. 'And how many hours a day did you do lessons?' said Alice, in a hurry to change the subject. 'Ten hours the first day,' said the Mock Turtle: 'nine the next, and so on.' 'What a curious plan!' exclaimed Alice. 'That's the reason they're called lessons,' the Gryphon remarked: 'because they lessen from day to day.' This was quite a new idea to Alice, and she thought it over a little before she made her next remark. 'Then the eleventh day must have been a holiday?' 'Of course it was,' said the Mock Turtle. 'And how did you manage on the twelfth?' Alice went on eagerly. 'That's enough about lessons,' the Gryphon interrupted in a very decided tone: 'tell her something about the games now.' CHAPTER X. The Lobster Quadrille The Mock Turtle sighed deeply, and drew the back of one flapper across his eyes. He looked at Alice, and tried to speak, but for a minute or two sobs choked his voice. 'Same as if he had a bone in his throat,' said the Gryphon: and it set to work shaking him and punching him in the back. At last the Mock Turtle recovered his voice, and, with tears running down his cheeks, he went on again:-- 'You may not have lived much under the sea--' ('I haven't,' said Alice)--'and perhaps you were never even introduced to a lobster--' (Alice began to say 'I once tasted--' but checked herself hastily, and said 'No, never') '--so you can have no idea what a delightful thing a Lobster Quadrille is!' 'No, indeed,' said Alice. 'What sort of a dance is it?' 'Why,' said the Gryphon, 'you first form into a line along the sea-shore--' 'Two lines!' cried the Mock Turtle. 'Seals, turtles, salmon, and so on; then, when you've cleared all the jelly-fish out of the way--' 'THAT generally takes some time,' interrupted the Gryphon. '--you advance twice--' 'Each with a lobster as a partner!' cried the Gryphon. 'Of course,' the Mock Turtle said: 'advance twice, set to partners--' '--change lobsters, and retire in same order,' continued the Gryphon. 'Then, you know,' the Mock Turtle went on, 'you throw the--' 'The lobsters!' shouted the Gryphon, with a bound into the air. '--as far out to sea as you can--' 'Swim after them!' screamed the Gryphon. 'Turn a somersault in the sea!' cried the Mock Turtle, capering wildly about. 'Change lobsters again!' yelled the Gryphon at the top of its voice. 'Back to land again, and that's all the first figure,' said the Mock Turtle, suddenly dropping his voice; and the two creatures, who had been jumping about like mad things all this time, sat down again very sadly and quietly, and looked at Alice. 
'It must be a very pretty dance,' said Alice timidly. 'Would you like to see a little of it?' said the Mock Turtle. 'Very much indeed,' said Alice. 'Come, let's try the first figure!' said the Mock Turtle to the Gryphon. 'We can do without lobsters, you know. Which shall sing?' 'Oh, YOU sing,' said the Gryphon. 'I've forgotten the words.' So they began solemnly dancing round and round Alice, every now and then treading on her toes when they passed too close, and waving their forepaws to mark the time, while the Mock Turtle sang this, very slowly and sadly:-- '"Will you walk a little faster?" said a whiting to a snail. "There's a porpoise close behind us, and he's treading on my tail. See how eagerly the lobsters and the turtles all advance! They are waiting on the shingle--will you come and join the dance? Will you, won't you, will you, won't you, will you join the dance? Will you, won't you, will you, won't you, won't you join the dance? "You can really have no notion how delightful it will be When they take us up and throw us, with the lobsters, out to sea!" But the snail replied "Too far, too far!" and gave a look askance-- Said he thanked the whiting kindly, but he would not join the dance. Would not, could not, would not, could not, would not join the dance. Would not, could not, would not, could not, could not join the dance. '"What matters it how far we go?" his scaly friend replied. "There is another shore, you know, upon the other side. The further off from England the nearer is to France-- Then turn not pale, beloved snail, but come and join the dance. Will you, won't you, will you, won't you, will you join the dance? Will you, won't you, will you, won't you, won't you join the dance?"' 'Thank you, it's a very interesting dance to watch,' said Alice, feeling very glad that it was over at last: 'and I do so like that curious song about the whiting!' 'Oh, as to the whiting,' said the Mock Turtle, 'they--you've seen them, of course?' 'Yes,' said Alice, 'I've often seen them at dinn--' she checked herself hastily. 'I don't know where Dinn may be,' said the Mock Turtle, 'but if you've seen them so often, of course you know what they're like.' 'I believe so,' Alice replied thoughtfully. 'They have their tails in their mouths--and they're all over crumbs.' 'You're wrong about the crumbs,' said the Mock Turtle: 'crumbs would all wash off in the sea. But they HAVE their tails in their mouths; and the reason is--' here the Mock Turtle yawned and shut his eyes.--'Tell her about the reason and all that,' he said to the Gryphon. 'The reason is,' said the Gryphon, 'that they WOULD go with the lobsters to the dance. So they got thrown out to sea. So they had to fall a long way. So they got their tails fast in their mouths. So they couldn't get them out again. That's all.' 'Thank you,' said Alice, 'it's very interesting. I never knew so much about a whiting before.' 'I can tell you more than that, if you like,' said the Gryphon. 'Do you know why it's called a whiting?' 'I never thought about it,' said Alice. 'Why?' 'IT DOES THE BOOTS AND SHOES.' the Gryphon replied very solemnly. Alice was thoroughly puzzled. 'Does the boots and shoes!' she repeated in a wondering tone. 'Why, what are YOUR shoes done with?' said the Gryphon. 'I mean, what makes them so shiny?' Alice looked down at them, and considered a little before she gave her answer. 'They're done with blacking, I believe.' 'Boots and shoes under the sea,' the Gryphon went on in a deep voice, 'are done with a whiting. Now you know.' 
'And what are they made of?' Alice asked in a tone of great curiosity. 'Soles and eels, of course,' the Gryphon replied rather impatiently: 'any shrimp could have told you that.' 'If I'd been the whiting,' said Alice, whose thoughts were still running on the song, 'I'd have said to the porpoise, "Keep back, please: we don't want YOU with us!"' 'They were obliged to have him with them,' the Mock Turtle said: 'no wise fish would go anywhere without a porpoise.' 'Wouldn't it really?' said Alice in a tone of great surprise. 'Of course not,' said the Mock Turtle: 'why, if a fish came to ME, and told me he was going a journey, I should say "With what porpoise?"' 'Don't you mean "purpose"?' said Alice. 'I mean what I say,' the Mock Turtle replied in an offended tone. And the Gryphon added 'Come, let's hear some of YOUR adventures.' 'I could tell you my adventures--beginning from this morning,' said Alice a little timidly: 'but it's no use going back to yesterday, because I was a different person then.' 'Explain all that,' said the Mock Turtle. 'No, no! The adventures first,' said the Gryphon in an impatient tone: 'explanations take such a dreadful time.' So Alice began telling them her adventures from the time when she first saw the White Rabbit. She was a little nervous about it just at first, the two creatures got so close to her, one on each side, and opened their eyes and mouths so VERY wide, but she gained courage as she went on. Her listeners were perfectly quiet till she got to the part about her repeating 'YOU ARE OLD, FATHER WILLIAM,' to the Caterpillar, and the words all coming different, and then the Mock Turtle drew a long breath, and said 'That's very curious.' 'It's all about as curious as it can be,' said the Gryphon. 'It all came different!' the Mock Turtle repeated thoughtfully. 'I should like to hear her try and repeat something now. Tell her to begin.' He looked at the Gryphon as if he thought it had some kind of authority over Alice. 'Stand up and repeat "'TIS THE VOICE OF THE SLUGGARD,"' said the Gryphon. 'How the creatures order one about, and make one repeat lessons!' thought Alice; 'I might as well be at school at once.' However, she got up, and began to repeat it, but her head was so full of the Lobster Quadrille, that she hardly knew what she was saying, and the words came very queer indeed:-- ''Tis the voice of the Lobster; I heard him declare, "You have baked me too brown, I must sugar my hair." As a duck with its eyelids, so he with his nose Trims his belt and his buttons, and turns out his toes.' [later editions continued as follows When the sands are all dry, he is gay as a lark, And will talk in contemptuous tones of the Shark, But, when the tide rises and sharks are around, His voice has a timid and tremulous sound.] 'That's different from what I used to say when I was a child,' said the Gryphon. 'Well, I never heard it before,' said the Mock Turtle; 'but it sounds uncommon nonsense.' Alice said nothing; she had sat down with her face in her hands, wondering if anything would EVER happen in a natural way again. 'I should like to have it explained,' said the Mock Turtle. 'She can't explain it,' said the Gryphon hastily. 'Go on with the next verse.' 'But about his toes?' the Mock Turtle persisted. 'How COULD he turn them out with his nose, you know?' 'It's the first position in dancing.' Alice said; but was dreadfully puzzled by the whole thing, and longed to change the subject. 
'Go on with the next verse,' the Gryphon repeated impatiently: 'it begins "I passed by his garden."' Alice did not dare to disobey, though she felt sure it would all come wrong, and she went on in a trembling voice:-- 'I passed by his garden, and marked, with one eye, How the Owl and the Panther were sharing a pie--' [later editions continued as follows The Panther took pie-crust, and gravy, and meat, While the Owl had the dish as its share of the treat. When the pie was all finished, the Owl, as a boon, Was kindly permitted to pocket the spoon: While the Panther received knife and fork with a growl, And concluded the banquet--] 'What IS the use of repeating all that stuff,' the Mock Turtle interrupted, 'if you don't explain it as you go on? It's by far the most confusing thing I ever heard!' 'Yes, I think you'd better leave off,' said the Gryphon: and Alice was only too glad to do so. 'Shall we try another figure of the Lobster Quadrille?' the Gryphon went on. 'Or would you like the Mock Turtle to sing you a song?' 'Oh, a song, please, if the Mock Turtle would be so kind,' Alice replied, so eagerly that the Gryphon said, in a rather offended tone, 'Hm! No accounting for tastes! Sing her "Turtle Soup," will you, old fellow?' The Mock Turtle sighed deeply, and began, in a voice sometimes choked with sobs, to sing this:-- 'Beautiful Soup, so rich and green, Waiting in a hot tureen! Who for such dainties would not stoop? Soup of the evening, beautiful Soup! Soup of the evening, beautiful Soup! Beau--ootiful Soo--oop! Beau--ootiful Soo--oop! Soo--oop of the e--e--evening, Beautiful, beautiful Soup! 'Beautiful Soup! Who cares for fish, Game, or any other dish? Who would not give all else for two Pennyworth only of beautiful Soup? Pennyworth only of beautiful Soup? Beau--ootiful Soo--oop! Beau--ootiful Soo--oop! Soo--oop of the e--e--evening, Beautiful, beauti--FUL SOUP!' 'Chorus again!' cried the Gryphon, and the Mock Turtle had just begun to repeat it, when a cry of 'The trial's beginning!' was heard in the distance. 'Come on!' cried the Gryphon, and, taking Alice by the hand, it hurried off, without waiting for the end of the song. 'What trial is it?' Alice panted as she ran; but the Gryphon only answered 'Come on!' and ran the faster, while more and more faintly came, carried on the breeze that followed them, the melancholy words:-- 'Soo--oop of the e--e--evening, Beautiful, beautiful Soup!' CHAPTER XI. Who Stole the Tarts? The King and Queen of Hearts were seated on their throne when they arrived, with a great crowd assembled about them--all sorts of little birds and beasts, as well as the whole pack of cards: the Knave was standing before them, in chains, with a soldier on each side to guard him; and near the King was the White Rabbit, with a trumpet in one hand, and a scroll of parchment in the other. In the very middle of the court was a table, with a large dish of tarts upon it: they looked so good, that it made Alice quite hungry to look at them--'I wish they'd get the trial done,' she thought, 'and hand round the refreshments!' But there seemed to be no chance of this, so she began looking at everything about her, to pass away the time. Alice had never been in a court of justice before, but she had read about them in books, and she was quite pleased to find that she knew the name of nearly everything there. 'That's the judge,' she said to herself, 'because of his great wig.' 
The judge, by the way, was the King; and as he wore his crown over the wig, (look at the frontispiece if you want to see how he did it,) he did not look at all comfortable, and it was certainly not becoming. 'And that's the jury-box,' thought Alice, 'and those twelve creatures,' (she was obliged to say 'creatures,' you see, because some of them were animals, and some were birds,) 'I suppose they are the jurors.' She said this last word two or three times over to herself, being rather proud of it: for she thought, and rightly too, that very few little girls of her age knew the meaning of it at all. However, 'jury-men' would have done just as well. The twelve jurors were all writing very busily on slates. 'What are they doing?' Alice whispered to the Gryphon. 'They can't have anything to put down yet, before the trial's begun.' 'They're putting down their names,' the Gryphon whispered in reply, 'for fear they should forget them before the end of the trial.' 'Stupid things!' Alice began in a loud, indignant voice, but she stopped hastily, for the White Rabbit cried out, 'Silence in the court!' and the King put on his spectacles and looked anxiously round, to make out who was talking. Alice could see, as well as if she were looking over their shoulders, that all the jurors were writing down 'stupid things!' on their slates, and she could even make out that one of them didn't know how to spell 'stupid,' and that he had to ask his neighbour to tell him. 'A nice muddle their slates'll be in before the trial's over!' thought Alice. One of the jurors had a pencil that squeaked. This of course, Alice could not stand, and she went round the court and got behind him, and very soon found an opportunity of taking it away. She did it so quickly that the poor little juror (it was Bill, the Lizard) could not make out at all what had become of it; so, after hunting all about for it, he was obliged to write with one finger for the rest of the day; and this was of very little use, as it left no mark on the slate. 'Herald, read the accusation!' said the King. On this the White Rabbit blew three blasts on the trumpet, and then unrolled the parchment scroll, and read as follows:-- 'The Queen of Hearts, she made some tarts, All on a summer day: The Knave of Hearts, he stole those tarts, And took them quite away!' 'Consider your verdict,' the King said to the jury. 'Not yet, not yet!' the Rabbit hastily interrupted. 'There's a great deal to come before that!' 'Call the first witness,' said the King; and the White Rabbit blew three blasts on the trumpet, and called out, 'First witness!' The first witness was the Hatter. He came in with a teacup in one hand and a piece of bread-and-butter in the other. 'I beg pardon, your Majesty,' he began, 'for bringing these in: but I hadn't quite finished my tea when I was sent for.' 'You ought to have finished,' said the King. 'When did you begin?' The Hatter looked at the March Hare, who had followed him into the court, arm-in-arm with the Dormouse. 'Fourteenth of March, I think it was,' he said. 'Fifteenth,' said the March Hare. 'Sixteenth,' added the Dormouse. 'Write that down,' the King said to the jury, and the jury eagerly wrote down all three dates on their slates, and then added them up, and reduced the answer to shillings and pence. 'Take off your hat,' the King said to the Hatter. 'It isn't mine,' said the Hatter. 'Stolen!' the King exclaimed, turning to the jury, who instantly made a memorandum of the fact. 
'I keep them to sell,' the Hatter added as an explanation; 'I've none of my own. I'm a hatter.' Here the Queen put on her spectacles, and began staring at the Hatter, who turned pale and fidgeted. 'Give your evidence,' said the King; 'and don't be nervous, or I'll have you executed on the spot.' This did not seem to encourage the witness at all: he kept shifting from one foot to the other, looking uneasily at the Queen, and in his confusion he bit a large piece out of his teacup instead of the bread-and-butter. Just at this moment Alice felt a very curious sensation, which puzzled her a good deal until she made out what it was: she was beginning to grow larger again, and she thought at first she would get up and leave the court; but on second thoughts she decided to remain where she was as long as there was room for her. 'I wish you wouldn't squeeze so.' said the Dormouse, who was sitting next to her. 'I can hardly breathe.' 'I can't help it,' said Alice very meekly: 'I'm growing.' 'You've no right to grow here,' said the Dormouse. 'Don't talk nonsense,' said Alice more boldly: 'you know you're growing too.' 'Yes, but I grow at a reasonable pace,' said the Dormouse: 'not in that ridiculous fashion.' And he got up very sulkily and crossed over to the other side of the court. All this time the Queen had never left off staring at the Hatter, and, just as the Dormouse crossed the court, she said to one of the officers of the court, 'Bring me the list of the singers in the last concert!' on which the wretched Hatter trembled so, that he shook both his shoes off. 'Give your evidence,' the King repeated angrily, 'or I'll have you executed, whether you're nervous or not.' 'I'm a poor man, your Majesty,' the Hatter began, in a trembling voice, '--and I hadn't begun my tea--not above a week or so--and what with the bread-and-butter getting so thin--and the twinkling of the tea--' 'The twinkling of the what?' said the King. 'It began with the tea,' the Hatter replied. 'Of course twinkling begins with a T!' said the King sharply. 'Do you take me for a dunce? Go on!' 'I'm a poor man,' the Hatter went on, 'and most things twinkled after that--only the March Hare said--' 'I didn't!' the March Hare interrupted in a great hurry. 'You did!' said the Hatter. 'I deny it!' said the March Hare. 'He denies it,' said the King: 'leave out that part.' 'Well, at any rate, the Dormouse said--' the Hatter went on, looking anxiously round to see if he would deny it too: but the Dormouse denied nothing, being fast asleep. 'After that,' continued the Hatter, 'I cut some more bread-and-butter--' 'But what did the Dormouse say?' one of the jury asked. 'That I can't remember,' said the Hatter. 'You MUST remember,' remarked the King, 'or I'll have you executed.' The miserable Hatter dropped his teacup and bread-and-butter, and went down on one knee. 'I'm a poor man, your Majesty,' he began. 'You're a very poor speaker,' said the King. Here one of the guinea-pigs cheered, and was immediately suppressed by the officers of the court. (As that is rather a hard word, I will just explain to you how it was done. They had a large canvas bag, which tied up at the mouth with strings: into this they slipped the guinea-pig, head first, and then sat upon it.) 'I'm glad I've seen that done,' thought Alice. 'I've so often read in the newspapers, at the end of trials, "There was some attempts at applause, which was immediately suppressed by the officers of the court," and I never understood what it meant till now.' 
'If that's all you know about it, you may stand down,' continued the King. 'I can't go no lower,' said the Hatter: 'I'm on the floor, as it is.' 'Then you may SIT down,' the King replied. Here the other guinea-pig cheered, and was suppressed. 'Come, that finished the guinea-pigs!' thought Alice. 'Now we shall get on better.' 'I'd rather finish my tea,' said the Hatter, with an anxious look at the Queen, who was reading the list of singers. 'You may go,' said the King, and the Hatter hurriedly left the court, without even waiting to put his shoes on. '--and just take his head off outside,' the Queen added to one of the officers: but the Hatter was out of sight before the officer could get to the door. 'Call the next witness!' said the King. The next witness was the Duchess's cook. She carried the pepper-box in her hand, and Alice guessed who it was, even before she got into the court, by the way the people near the door began sneezing all at once. 'Give your evidence,' said the King. 'Shan't,' said the cook. The King looked anxiously at the White Rabbit, who said in a low voice, 'Your Majesty must cross-examine THIS witness.' 'Well, if I must, I must,' the King said, with a melancholy air, and, after folding his arms and frowning at the cook till his eyes were nearly out of sight, he said in a deep voice, 'What are tarts made of?' 'Pepper, mostly,' said the cook. 'Treacle,' said a sleepy voice behind her. 'Collar that Dormouse,' the Queen shrieked out. 'Behead that Dormouse! Turn that Dormouse out of court! Suppress him! Pinch him! Off with his whiskers!' For some minutes the whole court was in confusion, getting the Dormouse turned out, and, by the time they had settled down again, the cook had disappeared. 'Never mind!' said the King, with an air of great relief. 'Call the next witness.' And he added in an undertone to the Queen, 'Really, my dear, YOU must cross-examine the next witness. It quite makes my forehead ache!' Alice watched the White Rabbit as he fumbled over the list, feeling very curious to see what the next witness would be like, '--for they haven't got much evidence YET,' she said to herself. Imagine her surprise, when the White Rabbit read out, at the top of his shrill little voice, the name 'Alice!' CHAPTER XII Alice's Evidence 'Here!' cried Alice, quite forgetting in the flurry of the moment how large she had grown in the last few minutes, and she jumped up in such a hurry that she tipped over the jury-box with the edge of her skirt, upsetting all the jurymen on to the heads of the crowd below, and there they lay sprawling about, reminding her very much of a globe of goldfish she had accidentally upset the week before. 'Oh, I BEG your pardon!' she exclaimed in a tone of great dismay, and began picking them up again as quickly as she could, for the accident of the goldfish kept running in her head, and she had a vague sort of idea that they must be collected at once and put back into the jury-box, or they would die. 'The trial cannot proceed,' said the King in a very grave voice, 'until all the jurymen are back in their proper places--ALL,' he repeated with great emphasis, looking hard at Alice as he said do. Alice looked at the jury-box, and saw that, in her haste, she had put the Lizard in head downwards, and the poor little thing was waving its tail about in a melancholy way, being quite unable to move. 
She soon got it out again, and put it right; 'not that it signifies much,' she said to herself; 'I should think it would be QUITE as much use in the trial one way up as the other.' As soon as the jury had a little recovered from the shock of being upset, and their slates and pencils had been found and handed back to them, they set to work very diligently to write out a history of the accident, all except the Lizard, who seemed too much overcome to do anything but sit with its mouth open, gazing up into the roof of the court. 'What do you know about this business?' the King said to Alice. 'Nothing,' said Alice. 'Nothing WHATEVER?' persisted the King. 'Nothing whatever,' said Alice. 'That's very important,' the King said, turning to the jury. They were just beginning to write this down on their slates, when the White Rabbit interrupted: 'UNimportant, your Majesty means, of course,' he said in a very respectful tone, but frowning and making faces at him as he spoke. 'UNimportant, of course, I meant,' the King hastily said, and went on to himself in an undertone, 'important--unimportant--unimportant--important--' as if he were trying which word sounded best. Some of the jury wrote it down 'important,' and some 'unimportant.' Alice could see this, as she was near enough to look over their slates; 'but it doesn't matter a bit,' she thought to herself. At this moment the King, who had been for some time busily writing in his note-book, cackled out 'Silence!' and read out from his book, 'Rule Forty-two. ALL PERSONS MORE THAN A MILE HIGH TO LEAVE THE COURT.' Everybody looked at Alice. 'I'M not a mile high,' said Alice. 'You are,' said the King. 'Nearly two miles high,' added the Queen. 'Well, I shan't go, at any rate,' said Alice: 'besides, that's not a regular rule: you invented it just now.' 'It's the oldest rule in the book,' said the King. 'Then it ought to be Number One,' said Alice. The King turned pale, and shut his note-book hastily. 'Consider your verdict,' he said to the jury, in a low, trembling voice. 'There's more evidence to come yet, please your Majesty,' said the White Rabbit, jumping up in a great hurry; 'this paper has just been picked up.' 'What's in it?' said the Queen. 'I haven't opened it yet,' said the White Rabbit, 'but it seems to be a letter, written by the prisoner to--to somebody.' 'It must have been that,' said the King, 'unless it was written to nobody, which isn't usual, you know.' 'Who is it directed to?' said one of the jurymen. 'It isn't directed at all,' said the White Rabbit; 'in fact, there's nothing written on the OUTSIDE.' He unfolded the paper as he spoke, and added 'It isn't a letter, after all: it's a set of verses.' 'Are they in the prisoner's handwriting?' asked another of the jurymen. 'No, they're not,' said the White Rabbit, 'and that's the queerest thing about it.' (The jury all looked puzzled.) 'He must have imitated somebody else's hand,' said the King. (The jury all brightened up again.) 'Please your Majesty,' said the Knave, 'I didn't write it, and they can't prove I did: there's no name signed at the end.' 'If you didn't sign it,' said the King, 'that only makes the matter worse. You MUST have meant some mischief, or else you'd have signed your name like an honest man.' There was a general clapping of hands at this: it was the first really clever thing the King had said that day. 'That PROVES his guilt,' said the Queen. 'It proves nothing of the sort!' said Alice. 'Why, you don't even know what they're about!' 'Read them,' said the King. 
The White Rabbit put on his spectacles. 'Where shall I begin, please your Majesty?' he asked. 'Begin at the beginning,' the King said gravely, 'and go on till you come to the end: then stop.' These were the verses the White Rabbit read:-- 'They told me you had been to her, And mentioned me to him: She gave me a good character, But said I could not swim. He sent them word I had not gone (We know it to be true): If she should push the matter on, What would become of you? I gave her one, they gave him two, You gave us three or more; They all returned from him to you, Though they were mine before. If I or she should chance to be Involved in this affair, He trusts to you to set them free, Exactly as we were. My notion was that you had been (Before she had this fit) An obstacle that came between Him, and ourselves, and it. Don't let him know she liked them best, For this must ever be A secret, kept from all the rest, Between yourself and me.' 'That's the most important piece of evidence we've heard yet,' said the King, rubbing his hands; 'so now let the jury--' 'If any one of them can explain it,' said Alice, (she had grown so large in the last few minutes that she wasn't a bit afraid of interrupting him,) 'I'll give him sixpence. _I_ don't believe there's an atom of meaning in it.' The jury all wrote down on their slates, 'SHE doesn't believe there's an atom of meaning in it,' but none of them attempted to explain the paper. 'If there's no meaning in it,' said the King, 'that saves a world of trouble, you know, as we needn't try to find any. And yet I don't know,' he went on, spreading out the verses on his knee, and looking at them with one eye; 'I seem to see some meaning in them, after all. "--SAID I COULD NOT SWIM--" you can't swim, can you?' he added, turning to the Knave. The Knave shook his head sadly. 'Do I look like it?' he said. (Which he certainly did NOT, being made entirely of cardboard.) 'All right, so far,' said the King, and he went on muttering over the verses to himself: '"WE KNOW IT TO BE TRUE--" that's the jury, of course--"I GAVE HER ONE, THEY GAVE HIM TWO--" why, that must be what he did with the tarts, you know--' 'But, it goes on "THEY ALL RETURNED FROM HIM TO YOU,"' said Alice. 'Why, there they are!' said the King triumphantly, pointing to the tarts on the table. 'Nothing can be clearer than THAT. Then again--"BEFORE SHE HAD THIS FIT--" you never had fits, my dear, I think?' he said to the Queen. 'Never!' said the Queen furiously, throwing an inkstand at the Lizard as she spoke. (The unfortunate little Bill had left off writing on his slate with one finger, as he found it made no mark; but he now hastily began again, using the ink, that was trickling down his face, as long as it lasted.) 'Then the words don't FIT you,' said the King, looking round the court with a smile. There was a dead silence. 'It's a pun!' the King added in an offended tone, and everybody laughed, 'Let the jury consider their verdict,' the King said, for about the twentieth time that day. 'No, no!' said the Queen. 'Sentence first--verdict afterwards.' 'Stuff and nonsense!' said Alice loudly. 'The idea of having the sentence first!' 'Hold your tongue!' said the Queen, turning purple. 'I won't!' said Alice. 'Off with her head!' the Queen shouted at the top of her voice. Nobody moved. 'Who cares for you?' said Alice, (she had grown to her full size by this time.) 'You're nothing but a pack of cards!' 
At this the whole pack rose up into the air, and came flying down upon her: she gave a little scream, half of fright and half of anger, and tried to beat them off, and found herself lying on the bank, with her head in the lap of her sister, who was gently brushing away some dead leaves that had fluttered down from the trees upon her face. 'Wake up, Alice dear!' said her sister; 'Why, what a long sleep you've had!' 'Oh, I've had such a curious dream!' said Alice, and she told her sister, as well as she could remember them, all these strange Adventures of hers that you have just been reading about; and when she had finished, her sister kissed her, and said, 'It WAS a curious dream, dear, certainly: but now run in to your tea; it's getting late.' So Alice got up and ran off, thinking while she ran, as well she might, what a wonderful dream it had been. But her sister sat still just as she left her, leaning her head on her hand, watching the setting sun, and thinking of little Alice and all her wonderful Adventures, till she too began dreaming after a fashion, and this was her dream:-- First, she dreamed of little Alice herself, and once again the tiny hands were clasped upon her knee, and the bright eager eyes were looking up into hers--she could hear the very tones of her voice, and see that queer little toss of her head to keep back the wandering hair that WOULD always get into her eyes--and still as she listened, or seemed to listen, the whole place around her became alive the strange creatures of her little sister's dream. The long grass rustled at her feet as the White Rabbit hurried by--the frightened Mouse splashed his way through the neighbouring pool--she could hear the rattle of the teacups as the March Hare and his friends shared their never-ending meal, and the shrill voice of the Queen ordering off her unfortunate guests to execution--once more the pig-baby was sneezing on the Duchess's knee, while plates and dishes crashed around it--once more the shriek of the Gryphon, the squeaking of the Lizard's slate-pencil, and the choking of the suppressed guinea-pigs, filled the air, mixed up with the distant sobs of the miserable Mock Turtle. So she sat on, with closed eyes, and half believed herself in Wonderland, though she knew she had but to open them again, and all would change to dull reality--the grass would be only rustling in the wind, and the pool rippling to the waving of the reeds--the rattling teacups would change to tinkling sheep-bells, and the Queen's shrill cries to the voice of the shepherd boy--and the sneeze of the baby, the shriek of the Gryphon, and all the other queer noises, would change (she knew) to the confused clamour of the busy farm-yard--while the lowing of the cattle in the distance would take the place of the Mock Turtle's heavy sobs. Lastly, she pictured to herself how this same little sister of hers would, in the after-time, be herself a grown woman; and how she would keep, through all her riper years, the simple and loving heart of her childhood: and how she would gather about her other little children, and make THEIR eyes bright and eager with many a strange tale, perhaps even with the dream of Wonderland of long ago: and how she would feel with all their simple sorrows, and find a pleasure in all their simple joys, remembering her own child-life, and the happy summer days. 
THE END End of Project Gutenberg's Alice's Adventures in Wonderland, by Lewis Carroll *** END OF THIS PROJECT GUTENBERG EBOOK ALICE'S ADVENTURES IN WONDERLAND *** ***** This file should be named 11.txt or 11.zip ***** This and all associated files of various formats will be found in: http://www.gutenberg.org/1/11/ Updated editions will replace the previous one--the old editions will be renamed. Creating the works from public domain print editions means that no one owns a United States copyright in these works, so the Foundation (and you!) can copy and distribute it in the United States without permission and without paying copyright royalties. Special rules, set forth in the General Terms of Use part of this license, apply to copying and distributing Project Gutenberg-tm electronic works to protect the PROJECT GUTENBERG-tm concept and trademark. Project Gutenberg is a registered trademark, and may not be used if you charge for the eBooks, unless you receive specific permission. If you do not charge anything for copies of this eBook, complying with the rules is very easy. You may use this eBook for nearly any purpose such as creation of derivative works, reports, performances and research. They may be modified and printed and given away--you may do practically ANYTHING with public domain eBooks. Redistribution is subject to the trademark license, especially commercial redistribution. *** START: FULL LICENSE *** THE FULL PROJECT GUTENBERG LICENSE PLEASE READ THIS BEFORE YOU DISTRIBUTE OR USE THIS WORK To protect the Project Gutenberg-tm mission of promoting the free distribution of electronic works, by using or distributing this work (or any other work associated in any way with the phrase "Project Gutenberg"), you agree to comply with all the terms of the Full Project Gutenberg-tm License (available with this file or online at http://gutenberg.org/license). Section 1. General Terms of Use and Redistributing Project Gutenberg-tm electronic works 1.A. By reading or using any part of this Project Gutenberg-tm electronic work, you indicate that you have read, understand, agree to and accept all the terms of this license and intellectual property (trademark/copyright) agreement. If you do not agree to abide by all the terms of this agreement, you must cease using and return or destroy all copies of Project Gutenberg-tm electronic works in your possession. If you paid a fee for obtaining a copy of or access to a Project Gutenberg-tm electronic work and you do not agree to be bound by the terms of this agreement, you may obtain a refund from the person or entity to whom you paid the fee as set forth in paragraph 1.E.8. 1.B. "Project Gutenberg" is a registered trademark. It may only be used on or associated in any way with an electronic work by people who agree to be bound by the terms of this agreement. There are a few things that you can do with most Project Gutenberg-tm electronic works even without complying with the full terms of this agreement. See paragraph 1.C below. There are a lot of things you can do with Project Gutenberg-tm electronic works if you follow the terms of this agreement and help preserve free future access to Project Gutenberg-tm electronic works. See paragraph 1.E below. 1.C. The Project Gutenberg Literary Archive Foundation ("the Foundation" or PGLAF), owns a compilation copyright in the collection of Project Gutenberg-tm electronic works. Nearly all the individual works in the collection are in the public domain in the United States. 
If an individual work is in the public domain in the United States and you are located in the United States, we do not claim a right to prevent you from copying, distributing, performing, displaying or creating derivative works based on the work as long as all references to Project Gutenberg are removed. Of course, we hope that you will support the Project Gutenberg-tm mission of promoting free access to electronic works by freely sharing Project Gutenberg-tm works in compliance with the terms of this agreement for keeping the Project Gutenberg-tm name associated with the work. You can easily comply with the terms of this agreement by keeping this work in the same format with its attached full Project Gutenberg-tm License when you share it without charge with others. 1.D. The copyright laws of the place where you are located also govern what you can do with this work. Copyright laws in most countries are in a constant state of change. If you are outside the United States, check the laws of your country in addition to the terms of this agreement before downloading, copying, displaying, performing, distributing or creating derivative works based on this work or any other Project Gutenberg-tm work. The Foundation makes no representations concerning the copyright status of any work in any country outside the United States. 1.E. Unless you have removed all references to Project Gutenberg: 1.E.1. The following sentence, with active links to, or other immediate access to, the full Project Gutenberg-tm License must appear prominently whenever any copy of a Project Gutenberg-tm work (any work on which the phrase "Project Gutenberg" appears, or with which the phrase "Project Gutenberg" is associated) is accessed, displayed, performed, viewed, copied or distributed: This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.org 1.E.2. If an individual Project Gutenberg-tm electronic work is derived from the public domain (does not contain a notice indicating that it is posted with permission of the copyright holder), the work can be copied and distributed to anyone in the United States without paying any fees or charges. If you are redistributing or providing access to a work with the phrase "Project Gutenberg" associated with or appearing on the work, you must comply either with the requirements of paragraphs 1.E.1 through 1.E.7 or obtain permission for the use of the work and the Project Gutenberg-tm trademark as set forth in paragraphs 1.E.8 or 1.E.9. 1.E.3. If an individual Project Gutenberg-tm electronic work is posted with the permission of the copyright holder, your use and distribution must comply with both paragraphs 1.E.1 through 1.E.7 and any additional terms imposed by the copyright holder. Additional terms will be linked to the Project Gutenberg-tm License for all works posted with the permission of the copyright holder found at the beginning of this work. 1.E.4. Do not unlink or detach or remove the full Project Gutenberg-tm License terms from this work, or any files containing a part of this work or any other work associated with Project Gutenberg-tm. 1.E.5. 
Do not copy, display, perform, distribute or redistribute this electronic work, or any part of this electronic work, without prominently displaying the sentence set forth in paragraph 1.E.1 with active links or immediate access to the full terms of the Project Gutenberg-tm License. 1.E.6. You may convert to and distribute this work in any binary, compressed, marked up, nonproprietary or proprietary form, including any word processing or hypertext form. However, if you provide access to or distribute copies of a Project Gutenberg-tm work in a format other than "Plain Vanilla ASCII" or other format used in the official version posted on the official Project Gutenberg-tm web site (www.gutenberg.org), you must, at no additional cost, fee or expense to the user, provide a copy, a means of exporting a copy, or a means of obtaining a copy upon request, of the work in its original "Plain Vanilla ASCII" or other form. Any alternate format must include the full Project Gutenberg-tm License as specified in paragraph 1.E.1. 1.E.7. Do not charge a fee for access to, viewing, displaying, performing, copying or distributing any Project Gutenberg-tm works unless you comply with paragraph 1.E.8 or 1.E.9. 1.E.8. You may charge a reasonable fee for copies of or providing access to or distributing Project Gutenberg-tm electronic works provided that - You pay a royalty fee of 20% of the gross profits you derive from the use of Project Gutenberg-tm works calculated using the method you already use to calculate your applicable taxes. The fee is owed to the owner of the Project Gutenberg-tm trademark, but he has agreed to donate royalties under this paragraph to the Project Gutenberg Literary Archive Foundation. Royalty payments must be paid within 60 days following each date on which you prepare (or are legally required to prepare) your periodic tax returns. Royalty payments should be clearly marked as such and sent to the Project Gutenberg Literary Archive Foundation at the address specified in Section 4, "Information about donations to the Project Gutenberg Literary Archive Foundation." - You provide a full refund of any money paid by a user who notifies you in writing (or by e-mail) within 30 days of receipt that s/he does not agree to the terms of the full Project Gutenberg-tm License. You must require such a user to return or destroy all copies of the works possessed in a physical medium and discontinue all use of and all access to other copies of Project Gutenberg-tm works. - You provide, in accordance with paragraph 1.F.3, a full refund of any money paid for a work or a replacement copy, if a defect in the electronic work is discovered and reported to you within 90 days of receipt of the work. - You comply with all other terms of this agreement for free distribution of Project Gutenberg-tm works. 1.E.9. If you wish to charge a fee or distribute a Project Gutenberg-tm electronic work or group of works on different terms than are set forth in this agreement, you must obtain permission in writing from both the Project Gutenberg Literary Archive Foundation and Michael Hart, the owner of the Project Gutenberg-tm trademark. Contact the Foundation as set forth in Section 3 below. 1.F. 1.F.1. Project Gutenberg volunteers and employees expend considerable effort to identify, do copyright research on, transcribe and proofread public domain works in creating the Project Gutenberg-tm collection. 
Despite these efforts, Project Gutenberg-tm electronic works, and the medium on which they may be stored, may contain "Defects," such as, but not limited to, incomplete, inaccurate or corrupt data, transcription errors, a copyright or other intellectual property infringement, a defective or damaged disk or other medium, a computer virus, or computer codes that damage or cannot be read by your equipment. 1.F.2. LIMITED WARRANTY, DISCLAIMER OF DAMAGES - Except for the "Right of Replacement or Refund" described in paragraph 1.F.3, the Project Gutenberg Literary Archive Foundation, the owner of the Project Gutenberg-tm trademark, and any other party distributing a Project Gutenberg-tm electronic work under this agreement, disclaim all liability to you for damages, costs and expenses, including legal fees. YOU AGREE THAT YOU HAVE NO REMEDIES FOR NEGLIGENCE, STRICT LIABILITY, BREACH OF WARRANTY OR BREACH OF CONTRACT EXCEPT THOSE PROVIDED IN PARAGRAPH F3. YOU AGREE THAT THE FOUNDATION, THE TRADEMARK OWNER, AND ANY DISTRIBUTOR UNDER THIS AGREEMENT WILL NOT BE LIABLE TO YOU FOR ACTUAL, DIRECT, INDIRECT, CONSEQUENTIAL, PUNITIVE OR INCIDENTAL DAMAGES EVEN IF YOU GIVE NOTICE OF THE POSSIBILITY OF SUCH DAMAGE. 1.F.3. LIMITED RIGHT OF REPLACEMENT OR REFUND - If you discover a defect in this electronic work within 90 days of receiving it, you can receive a refund of the money (if any) you paid for it by sending a written explanation to the person you received the work from. If you received the work on a physical medium, you must return the medium with your written explanation. The person or entity that provided you with the defective work may elect to provide a replacement copy in lieu of a refund. If you received the work electronically, the person or entity providing it to you may choose to give you a second opportunity to receive the work electronically in lieu of a refund. If the second copy is also defective, you may demand a refund in writing without further opportunities to fix the problem. 1.F.4. Except for the limited right of replacement or refund set forth in paragraph 1.F.3, this work is provided to you 'AS-IS' WITH NO OTHER WARRANTIES OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO WARRANTIES OF MERCHANTIBILITY OR FITNESS FOR ANY PURPOSE. 1.F.5. Some states do not allow disclaimers of certain implied warranties or the exclusion or limitation of certain types of damages. If any disclaimer or limitation set forth in this agreement violates the law of the state applicable to this agreement, the agreement shall be interpreted to make the maximum disclaimer or limitation permitted by the applicable state law. The invalidity or unenforceability of any provision of this agreement shall not void the remaining provisions. 1.F.6. INDEMNITY - You agree to indemnify and hold the Foundation, the trademark owner, any agent or employee of the Foundation, anyone providing copies of Project Gutenberg-tm electronic works in accordance with this agreement, and any volunteers associated with the production, promotion and distribution of Project Gutenberg-tm electronic works, harmless from all liability, costs and expenses, including legal fees, that arise directly or indirectly from any of the following which you do or cause to occur: (a) distribution of this or any Project Gutenberg-tm work, (b) alteration, modification, or additions or deletions to any Project Gutenberg-tm work, and (c) any Defect you cause. Section 2. 
Information about the Mission of Project Gutenberg-tm Project Gutenberg-tm is synonymous with the free distribution of electronic works in formats readable by the widest variety of computers including obsolete, old, middle-aged and new computers. It exists because of the efforts of hundreds of volunteers and donations from people in all walks of life. Volunteers and financial support to provide volunteers with the assistance they need, is critical to reaching Project Gutenberg-tm's goals and ensuring that the Project Gutenberg-tm collection will remain freely available for generations to come. In 2001, the Project Gutenberg Literary Archive Foundation was created to provide a secure and permanent future for Project Gutenberg-tm and future generations. To learn more about the Project Gutenberg Literary Archive Foundation and how your efforts and donations can help, see Sections 3 and 4 and the Foundation web page at http://www.pglaf.org. Section 3. Information about the Project Gutenberg Literary Archive Foundation The Project Gutenberg Literary Archive Foundation is a non profit 501(c)(3) educational corporation organized under the laws of the state of Mississippi and granted tax exempt status by the Internal Revenue Service. The Foundation's EIN or federal tax identification number is 64-6221541. Its 501(c)(3) letter is posted at http://pglaf.org/fundraising. Contributions to the Project Gutenberg Literary Archive Foundation are tax deductible to the full extent permitted by U.S. federal laws and your state's laws. The Foundation's principal office is located at 4557 Melan Dr. S. Fairbanks, AK, 99712., but its volunteers and employees are scattered throughout numerous locations. Its business office is located at 809 North 1500 West, Salt Lake City, UT 84116, (801) 596-1887, email business@pglaf.org. Email contact links and up to date contact information can be found at the Foundation's web site and official page at http://pglaf.org For additional contact information: Dr. Gregory B. Newby Chief Executive and Director gbnewby@pglaf.org Section 4. Information about Donations to the Project Gutenberg Literary Archive Foundation Project Gutenberg-tm depends upon and cannot survive without wide spread public support and donations to carry out its mission of increasing the number of public domain and licensed works that can be freely distributed in machine readable form accessible by the widest array of equipment including outdated equipment. Many small donations ($1 to $5,000) are particularly important to maintaining tax exempt status with the IRS. The Foundation is committed to complying with the laws regulating charities and charitable donations in all 50 states of the United States. Compliance requirements are not uniform and it takes a considerable effort, much paperwork and many fees to meet and keep up with these requirements. We do not solicit donations in locations where we have not received written confirmation of compliance. To SEND DONATIONS or determine the status of compliance for any particular state visit http://pglaf.org While we cannot and do not solicit contributions from states where we have not met the solicitation requirements, we know of no prohibition against accepting unsolicited donations from donors in such states who approach us with offers to donate. International donations are gratefully accepted, but we cannot make any statements concerning tax treatment of donations received from outside the United States. U.S. laws alone swamp our small staff. 
Please check the Project Gutenberg Web pages for current donation methods and addresses. Donations are accepted in a number of other ways including checks, online payments and credit card donations. To donate, please visit: http://pglaf.org/donate Section 5. General Information About Project Gutenberg-tm electronic works. Professor Michael S. Hart is the originator of the Project Gutenberg-tm concept of a library of electronic works that could be freely shared with anyone. For thirty years, he produced and distributed Project Gutenberg-tm eBooks with only a loose network of volunteer support. Project Gutenberg-tm eBooks are often created from several printed editions, all of which are confirmed as Public Domain in the U.S. unless a copyright notice is included. Thus, we do not necessarily keep eBooks in compliance with any particular paper edition. Most people start at our Web site which has the main PG search facility: http://www.gutenberg.org This Web site includes information about Project Gutenberg-tm, including how to make donations to the Project Gutenberg Literary Archive Foundation, how to help produce our new eBooks, and how to subscribe to our email newsletter to hear about new eBooks.

================================================
FILE: examples/input_format/check_results.py
================================================
# BEGIN_COPYRIGHT
#
# Copyright 2009-2026 CRS4.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
# of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

import sys
import logging

logging.basicConfig(level=logging.INFO)

import pydoop.hadut as hadut
import pydoop.test_support as pts


def get_res(output_dir):
    all_data = hadut.collect_output(output_dir)
    return pts.parse_mr_output(all_data, vtype=int)


def check(measured_res, expected_res):
    res = pts.compare_counts(measured_res, expected_res)
    if res:
        return "ERROR: %s" % res
    else:
        return "OK."


def main(argv):
    logger = logging.getLogger("main")
    logger.setLevel(logging.INFO)
    input_dir = argv[1]
    output_dir = argv[2]
    logger.info("checking results")
    lwc = pts.LocalWordCount(input_dir)
    measured_res = get_res(output_dir)
    expected_res = lwc.expected_output
    logger.info(check(measured_res, expected_res))


if __name__ == "__main__":
    main(sys.argv)
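To make the checking logic above concrete: the job emits one "word<TAB>count" record per line, and the verifier parses those records into a mapping and compares it with counts computed locally from the input files. The following is a minimal, pure-Python sketch of that flow, for illustration only: the real parse/compare helpers live in pydoop.test_support and pydoop.hadut, which also handle collecting the part-* output files from HDFS, and their exact rules may differ.

from collections import Counter

def parse_mr_output(all_data, vtype=int):
    # One "key<TAB>value" record per line, as emitted by the wordcount job.
    counts = {}
    for line in all_data.splitlines():
        k, v = line.split("\t")
        counts[k] = vtype(v)
    return counts

def local_word_count(text):
    # Stand-in for pts.LocalWordCount's expected_output (assumption:
    # plain whitespace tokenization).
    return Counter(text.split())

measured = parse_mr_output("alice\t2\nrabbit\t1")
expected = local_word_count("alice rabbit alice")
assert measured == expected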
================================================
FILE: examples/input_format/it/crs4/pydoop/mapred/TextInputFormat.java
================================================
// BEGIN_COPYRIGHT
//
// Copyright 2009-2026 CRS4.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
//
// END_COPYRIGHT

package it.crs4.pydoop.mapred;

import java.io.*;

import org.apache.hadoop.fs.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;


public class TextInputFormat extends FileInputFormat<LongWritable, Text>
        implements JobConfigurable {

    private Boolean will_split;

    public void configure(JobConf conf) {
        will_split = conf.getBoolean("pydoop.input.issplitable", true);
    }

    protected boolean isSplitable(FileSystem fs, Path file) {
        return will_split;
    }

    public RecordReader<LongWritable, Text> getRecordReader(
            InputSplit genericSplit, JobConf job, Reporter reporter)
            throws IOException {
        reporter.setStatus(genericSplit.toString());
        return new LineRecordReader(job, (FileSplit) genericSplit);
    }
}

================================================
FILE: examples/input_format/it/crs4/pydoop/mapreduce/TextInputFormat.java
================================================
// BEGIN_COPYRIGHT
//
// Copyright 2009-2026 CRS4.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
//
// END_COPYRIGHT

// DOCS_INCLUDE_START
package it.crs4.pydoop.mapreduce;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;


public class TextInputFormat extends FileInputFormat<LongWritable, Text> {

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(
            InputSplit split, TaskAttemptContext context) {
        return new LineRecordReader();
    }

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        return context.getConfiguration().getBoolean(
            "pydoop.input.issplitable", true);
    }
}
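Both input format variants (old "mapred" API and new "mapreduce" API) read the same job property, pydoop.input.issplitable, which the run script below passes on the pydoop submit command line as "-D pydoop.input.issplitable=true". As a rough illustration of what Hadoop's Configuration.getBoolean does with that string value, here is a pure-Python stand-in (a sketch, not Hadoop code):

def get_boolean(conf, key, default):
    # -D key=value pairs are stored as strings; getBoolean maps
    # "true"/"false" (case-insensitively) to booleans, else the default.
    value = conf.get(key, "").strip().lower()
    if value == "true":
        return True
    if value == "false":
        return False
    return default

conf = {"pydoop.input.issplitable": "false"}  # as set via -D key=value
assert get_boolean(conf, "pydoop.input.issplitable", True) is False

Setting the property to false makes isSplitable return false, so each input file becomes a single, unsplit map task, which is the behavior this custom TextInputFormat exists to toggle.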
"${this_dir}/../config.sh" module="wordcount_minimal" module_path="${this_dir}/../pydoop_submit/mr/${module}.py" job_name="input_format_test_job" jar_name="pydoop-input-formats.jar" wd=$(mktemp -d) javac -cp $(${HADOOP} classpath) -d "${wd}" it/crs4/pydoop/mapred*/*.java jar cvf "${wd}/${jar_name}" -C "${wd}" it opts=( "--upload-file-to-cache" "${module_path}" "--entry-point" "main" "--input-format" "it.crs4.pydoop.mapreduce.TextInputFormat" "--libjars" "${wd}/${jar_name}" "-D" "pydoop.input.issplitable=true" "-D" "mapreduce.job.name=${job_name}" "-D" "mapreduce.task.timeout=10000" ) [ -n "${DEBUG:-}" ] && opts+=( "--log-level" "DEBUG" ) if [ "$(hadoop_fs)" != "file" ]; then ensure_dfs_home input="input" output="output" ${HDFS} dfs -rm -r -f "${input}" "${output}" ${HDFS} dfs -put "${this_dir}/../input" "${input}" else input="${this_dir}/../input" output="${wd}/output" fi ${PYDOOP} submit "${opts[@]}" ${module} "${input}" "${output}" ${PYTHON} "${this_dir}"/check_results.py "${this_dir}/../input" "${output}" rm -rf "${wd}" ================================================ FILE: examples/pydoop_script/check.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT import sys import os import argparse from collections import Counter import pydoop.hadut as hadut import pydoop.hdfs as hdfs import pydoop.test_support as pts THIS_DIR = os.path.dirname(os.path.abspath(__file__)) DEFAULT_INPUT_DIR = os.path.join(THIS_DIR, os.pardir, "input") CHECKS = [ "base_histogram", "caseswitch", "grep", "grep_compiled", "lowercase", "transpose", "wc_combiner", "wordcount", "wordcount_sw", ] def check_base_histogram(mr_out_dir): output = Counter() for line in hadut.collect_output(mr_out_dir).splitlines(): k, v = line.split("\t") output[k] = int(v) exp_output = Counter() in_dir = os.path.join(THIS_DIR, "data", "base_histogram_input") for name in os.listdir(in_dir): with open(os.path.join(in_dir, name)) as f: for line in f: for base in line.rstrip().split("\t", 10)[9]: exp_output[base] += 1 return output == exp_output def check_caseswitch(mr_out_dir, switch="upper"): output = set(hadut.collect_output(mr_out_dir).splitlines()) exp_output = set() for name in os.listdir(DEFAULT_INPUT_DIR): with open(os.path.join(DEFAULT_INPUT_DIR, name)) as f: exp_output.update(getattr(_.rstrip(), switch)() for _ in f) return output == exp_output def check_grep(mr_out_dir): output = set(hadut.collect_output(mr_out_dir).splitlines()) exp_output = set() for name in os.listdir(DEFAULT_INPUT_DIR): with open(os.path.join(DEFAULT_INPUT_DIR, name)) as f: exp_output.update(_.rstrip() for _ in f if "March" in _) return output == exp_output check_grep_compiled = check_grep def check_lowercase(mr_out_dir): return check_caseswitch(mr_out_dir, switch="lower") def check_transpose(mr_out_dir): output = [] for fn in hadut.iter_mr_out_files(mr_out_dir): with hdfs.open(fn, "rt") as f: for line in f: row = line.rstrip().split("\t") index = int(row.pop(0)) 
output.append((index, row)) output = [_[1] for _ in sorted(output)] exp_output = [] in_fn = os.path.join(THIS_DIR, "data", "transpose_input", "matrix.txt") with open(in_fn) as f: for line in f: for i, item in enumerate(line.split()): try: exp_output[i].append(item) except IndexError: exp_output.append([item]) return output == exp_output def check_wordcount(mr_out_dir, stop_words=None): output = hadut.collect_output(mr_out_dir) local_wc = pts.LocalWordCount(DEFAULT_INPUT_DIR, stop_words=stop_words) res = local_wc.check(output) return res.startswith("OK") # FIXME: change local_wc to raise an exception def check_wordcount_sw(mr_out_dir): with open(os.path.join(THIS_DIR, "data", "stop_words.txt"), "rt") as f: stop_words = frozenset(_.strip() for _ in f if not _.isspace()) return check_wordcount(mr_out_dir, stop_words=stop_words) check_wc_combiner = check_wordcount def make_parser(): parser = argparse.ArgumentParser() parser.add_argument("name", metavar="NAME", choices=CHECKS, help="one of: %s" % "; ".join(CHECKS)) parser.add_argument("mr_out", metavar="DIR", help="MapReduce out dir") return parser def main(argv): parser = make_parser() args = parser.parse_args(argv) check = globals()["check_%s" % args.name] if check(args.mr_out): print("OK.") else: sys.exit("ERROR: output differs from the expected one") if __name__ == "__main__": main(sys.argv[1:]) ================================================ FILE: examples/pydoop_script/data/base_histogram_input/example_1.sam ================================================ foo_0/1 81 chr6 3558357 37 91M * 0 0 AGCTTCTTTGACTCTCGAATTTTAGCACTAGAAGAAATAGTGAGGATTATATATTTCAGAAGTTCTCACCCAGGATATCAGAACACATTCA 5:CB:CCBCCB>:C@;BBBB??B;?>1@@=C=4ACCAB3A8=CC=C?CBC=CBCCCCCCCCCCCCC@5>?=?CAAB=3=>====5>=AC?C XT:A:U NM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_1/1 99 chr1 155858007 60 91M = 155858380 464 TGCTGTGCTTCAAATGGATAAAGCCACATTATGTCAACAAGAGGCTTGTTATCTTGGTAACCAGTTACCGTTTTTATGTCCATTCTGCCCT EEEEEBEFDFGGGDGGGGGGGDDDGGFGGGF?DFAFDBF?A,CC?B9.:?27;-:=A################################## XT:A:U NM:i:2 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:68A17C4 foo_1/2 147 chr1 155858380 60 91M = 155858007 -464 GTTTCATGCCATTCCCCTGCCTCAGCCCCCTGAGTAGCTGGGACTACAGGTGCCCGCCACCATGCCATGCAGAGCTTTTAAAAACACAGAT #####################################################??+@@>B@-FAEFE?EDEB5EDECC=8?BDDAC?=DDD XT:A:U NM:i:3 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:3 XO:i:0 XG:i:0 MD:Z:1G12T12T63 foo_3/1 99 chr7 148337326 60 91M = 148337704 469 CAACAACAACAAAAAAAACCCGGAAAAGTTCACAGAAAAACAGCTAATTTGCCAGAAGCTGCTGTCAACTGATGTCTATAAGCAGCACTGA GGGGGGGGGGGFGGGGFGGGGGGDGDEEECFFFFFGAGDGGFGFGG:EFGDBGEFE?B=?ECCBCBA@?AAAAAADDA:BCCCA?B=CB=9 XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:17C73 foo_3/2 147 chr7 148337704 60 91M = 148337326 -469 AAGGACCATGTGGTCTGTCCCAGCTGCTTAACTCTGCTGCTGCAGTGTGAAAGCAGCTGGACAGGGTGTGGACAAAGCTGTGTTCCAATGA #########@@C-@?C5-DDD-EDA:FFFFF?=?>4BB-DDEBADDDDDDC-FFAFB=EED?B;GFEEEEE=:EDDGGFDGFGFGGGGFFF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_5/1 99 chr2 157433580 60 91M = 157433946 457 CTGGGTAAAAGTAGCTGTAGATGTCATCTCATTTATCTGAAGTACCGTTTTCTCACATTAACTCTTTCAATTTCATTGCCACATCAGATCA GGGGGGGGGGGEGGGGGGGGFFGFGBGGFGDGGGGGGGGGGGBGGFGDGGGFGFGGFEGGGEGGGDFDAEGFBEEGEEGGGGFGGGGBGFB XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_5/2 147 chr2 157433946 60 91M = 157433580 -457 GCATAAGGAAAAACACACCTAGACACACAGAGAAAGATAGCTAGATACTGAAGACAAAGAGAAAATATTAAAAGCAATCAGAGAAGAAAAG 
?:=DE=E>@EEC?A?A=FE=EGDGGGGGGGGGGGGGGGGGGFGGFDGFDFGGGGGGGEGGGGGFFDGGGGGGEGGGGGAGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_7/1 81 chr17 72146167 37 91M * 0 0 TGATAAAAGCGATAATCCTCAGCCCCCTGAGTAGCTGGGATTACAGGCAACCGCCACCACGCCTGGCTAACTTTTGTATTTTTATTTATTG ################################################?5BGAGGDFFFFBFGEF=GGGGGFFGEGGG@GGGEEDGGGGGG XT:A:U NM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:16A7T66 foo_8/1 99 chr13 75552481 60 91M = 75552867 477 GGTGAGGTTGCAGAGAAAAGGAACACCTTTACACTGCTAGTGGAAGTGTAAATTAGTCCAACCACTGCGGAAGACAGTGTGACAATTCCTC FFEFFGGEGGGGGGGGGGGGGBGDDGGGGEEGGGGGGGGEEGGGECACABFFEDFC@CECEBEEABCCCDBC:AB7;8;5CBB:=?BBBBG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_8/2 147 chr13 75552867 60 91M = 75552481 -477 CAGAAAAACAAACACTGCACATTCTCACCTATAAGTGGGAGGTGAATGATGAGAACACATGGAAAGATGGGGTGGGGGAAAACACATACTG ;GEGGFEBACCBEEEBFDFFFFEFFFEGFEGGGGGFGGGGGGGGBG=GGFGGGEDGGGFGGGEGGGGGGGGGGGGGGGGGGEGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_10/1 99 chr16 82273022 60 91M = 82273387 456 AGCAGGCACTGGAAGGAGAAACTTAGCTCGAAAAGAACTCTTCCTTCTTCTCCTCCTCCTCCTCCACAAAACAGGAATCAGGTGTCTCCTA GGGGGGEEGGGFCGDE:EEEFFFFDDBAFEEFFFBGGEAGDGEDDGGGGFDEGEBE=:EB?A>>CDEFGFFB:DFADAD=4A077C>CCAE XT:A:U NM:i:3 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:3 XO:i:0 XG:i:0 MD:Z:15A12G61G0 foo_10/2 147 chr16 82273387 60 91M = 82273022 -456 GTCTTCCATGATTATTCATAAGGAGGTGGGAAGAGGCGTTACTAGGAAGCTGTTCTTGGTGGTCTTCCGGGTGCACATGTGCAGCAGCTGT C?:AA?:=661C??C=B:AE;ECECA:EABEACC5CDBBD?AECEEGBEFECCA>B+CBAAC5=C:D;EEEEECA=DEFFFD?EADFEFFE XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:56G34 foo_12/1 83 chr9 21719826 60 91M = 21719457 -460 AAAGGAATGCTGGGCAAAAATTTTGCGAAACTGATGAAAGACACCAGAAATGTAAGTAATTCTTTGTACCCCAAGCAGGATAACTACAACG FGBFGGEEFGEEEEECE?EBEGGGGGEEEBBBGEGGGDEGEEBE=BFGGGFGFFGGFGFGGGGGGGGBGDGGGGGGGFGGFGEGGGGGGGG XT:A:U NM:i:2 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:26A42T21 foo_12/2 163 chr9 21719457 60 91M = 21719826 460 TAGATCAACAAATAATACAGATACTGAAGTTATCAAACACAGACTTTAAAACAGCTATGATTAACATTTTCAAGAAAACAGATTTAAAAGT FGGGGGGDGGGGDGGGGGBGGGGGGGEFGBGGFGGGGEGGGGAGGGGFGDGGGGGFGGGFGFEDFFEFFDDGDDBGF>>GGEFEGFFF?=1 XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_14/1 99 chr8 77600428 60 91M = 77600797 460 CACTGTCCACTGTGAAAAATAATAAGAAAAAATACAAATTATAACGAATTTGCCTTCAGGAGGCCTTGTTAGACATAATGGGACAATACCT GGGGGGGGGGGGGGGGGGGGGGFGFEGGGGGGEGGGGGGGGGGGGFFGGEGGFGGEGGGGDEEEEEBEEEEEEEGGFFGGEG@EAGEGEEE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_14/2 147 chr8 77600797 60 91M = 77600428 -460 AGACTTTTCTTCAACTTAAACTTTGCATTGAAATAAAAAAAGAATTGTACATGACATTATGGAAAATAACAAATCAACCCTTAAATCAGCA GG=GCGDGFFGGGGGGFDFEFDFGGGGGAFGGGGGGEEFFDDGGGDGFFFDFGGEGDGGGGGGGGGGGEGGGGBGGGGGGGGGGGGGGGFG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_16/1 99 chrX 73888313 60 91M = 73888696 474 TCCTTCCCTTCCTCTTCTTTCCTTCCCTCCCTCCCTCTTCTTTCCTTCCCTCCCTCCCTCCTCCCTTCCTTCCCTTCCTTCCCTTCCTTCC GGGGGGGGG:GGGGGGGGGGEGFGGGDGGGD@6(@############################### XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_16/2 147 chrX 73888696 60 91M = 73888313 -474 ATTACATGTACTACAAAAAATTGACATGCAAGAATAAATTCTGATTTAAGTTTTGTGTGATACTGAGATGAGCCTGAGGGAACTCTTTAAT ?BADD:FFFFF>CDDGG?FFGGGGGFGGAGGGBGGGEFF5FGGDFGGGFGDFGGGGGGEFGGGEGGFDFF:FGGGGFGGGGFFEGGFEGFG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_18/1 83 chr14 44712230 60 91M = 
44711860 -461 TTCTTAATTTGAAGTAATAAGAAATACTTTGTTTTTCTGTATAGTTCCTTGTGATCTCAGTAAAGAATGAGTTTTCTGCCTTAAGAGAAGA E:?ECDDC:EEAAAEEBC:CB9CA:B:5?A?;D?5?FEBEDEEEEEAB:DD=DD?CEB:EECACACB:EBEAAC?A=?AFFEEFEFFFFFF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_18/2 163 chr14 44711860 60 91M = 44712230 461 CTTTTTTTCTTCTGAGATTTTAGTTGGGAGTATCTAAATGGATCCAAATTATGTTAAGCTGATTATATGTATATTGAGTGTTTTTTAAAAT D?EEDEEEBEBCE=DA5CAADEEBD@;?>@DBD5=CDD:DFFFEFBEDCFDADEE=BEEEDF=FEF:FDEFFDFFAA?=C=DDA@>5;,@; XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_20/2 129 chr1 207736382 37 91M * 0 0 TTCAATGTCCACCTCCCAGAATCAAACATCTTCCCACCTCAGCCCCCCCAAGTAGCTGAGACTACGGGCACACATCACCACACCTGGCTAA ;<5;;@>C>@@,>;?99A9?>>.>@?-BB.5;2;AAAAC?3*37=27>111:3?7:9-<29<;>9=@>46)7/;59-09??:A5A@.;>B@ XT:A:U NM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_21/1 83 chr7 50438238 60 91M = 50437868 -461 CTCAAACCAGAACTGTAAATAGTGATTGCAGGAATTCTTTTCTAAACTGCTTTGCCCTTTCCTCTCACTGCCTTTTATAGCCAATATAAAT BB?GFBGGGFGEGGGGGEGEGGGEEGGGGGGEGEGEGGGFFGFGGEGBFGGGDFGGFGGFFFBFCFDGECGFGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_21/2 163 chr7 50437868 60 5M1I85M = 50438238 461 GTCTTCAAAAATATATTTCCTCATAAACATTTGAGTTTTGTTGAAAAGATGGAGTTTACAAAGATACCATTCTTGAGTCATGGATTTCTCT GGGGGGDGGGGGGGFGGGGGGGGGDFGGGGGGF=GFGGGFGEGGGGGFFGGGGDFGGGGGFGGGDFGGFGGGGGGGEEFFAEEEBDBGGGG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:1 XG:i:1 MD:Z:90 foo_23/1 83 chr9 108075300 60 91M = 108074918 -473 GGCTGTGTATTTACAGGAAGACTTACTTGGCTGGTATTTACTTCATGTGGACATAAAATGTGGAAAGTAGGTACTGTATATAACTTTATTT GGGGGGGGGGGGGGGGD?EECFGFDFFGGEDDGEDFGGFFGGDGGGGGEGGGFGGGGGEGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_23/2 163 chr9 108074918 60 91M = 108075300 473 TGTAGTTGTACAAGTAGGAGAGAGTGAAGTTTCTATCAAGATTGTAGCAGTTGAGATAGAAAAAGGGAAACAGATTTTAGGTATATTTTGG GGGGGGGGGEFFFFFFFEFFDEFEBFFDFFGFGGGGGGFGBFEFFGFGGGDEEEEGGEGFGFEAGGGBGFGGG=GFFBABFDFDF?EEEEG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_25/1 65 chr18 16773270 25 91M * 0 0 TCTACAAAAAGAGTGTTTCAAAACTGCTCTGTAAAAAGAAAGGTTCAACTATGTTAGTTGAGTACACATATCACAAACAAGTTTCACAGAA GGGGGGGGGGGGGBGGGGGGGGGGGGGGGGGGGGFGFGGGEGFGGGGFGGGGGEGGGEGFGGEGFGGFG=EGGGGBFFF>EEEEFGEF=EG XT:A:U NM:i:4 SM:i:25 AM:i:0 X0:i:1 X1:i:0 XM:i:4 XO:i:0 XG:i:0 MD:Z:50C17C16T0G4 foo_26/1 83 chr6 83345667 60 91M = 83345296 -462 TTTTATCAGTTGAATAACAACAGAGCTCAGCTTATAACTAACTTATAAAACAGGCTCTATGGCACGATGTGGGTGAAGAGGATTCAGTGCA DCECB?EEEACDDDDDBCCB:EEEFE?FFFFEF5FFFGGGFFGDFGGGFFGGGFGGGFGGGF=FBFFFFDFFGGGGGGGGDGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_26/2 163 chr6 83345296 60 91M = 83345667 462 ATTCCTCTGGAGAGCAATGGTGTACATACTCAAGTCAATTCATTAGGACATGGGGCTGATGATGAAAGCATGTAACAAAGGAACTGCTGTG FGEEDEGFGGFGGFGEFFFFFEFFEEEBE=DDDDBFFDFB??DDDEE?CEGGFDDGGGDGGGGE:AEDE-AD=DCFF?6B:=BABDDB5A6 XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_28/2 145 chr17 16958252 37 91M * 0 0 GTCTTGGCCATTTGTATATCTTCTTTGTATATTCACATCCTTTGTCTATTTTTAATTGTTTTTTGTTGTTGAGTTTTAGGAGCTGTTTATT BECGGGC?AA:DEEBBGFGFGGF?GFGEGGGGGGGGGADGGEEE?EF?GGFGGEGGAGGGFGEGGGGGGFGGGGFGGGGGGGFDGDGGGEG XT:A:U NM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_29/1 99 chr22 45659974 60 91M = 45660345 462 CTGTGATTAGTGGGATTAAAGCAGCACTGTGTGAAGCACGGAGCGTGGTGAGGACCCAGCCCTTCCTTAGTTCACCTGCGGGTATGGCAAT GGGFGFFFFFBFFFDFGGDGGGEGDGGDGBGEGGGFGGECFF?FEAEEAEEEEEBD@EE@EDBBDABBBBBBEBE=FFFFEB=A@>>>=5B 
XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_29/2 147 chr22 45660345 60 91M = 45659974 -462 AAGAGGCCTGATTCTGGAGATGCTCAGGTGGGTCATTAAATAATCACCCGGAAGGCACCGCAGAACCACGCCTGGCACTTCTGTGCACACT ####@35C@C?D-FFFFF:DFEDCECDE=EDEGDGFEDFFFABAC:EEEAFFFFFF?DBDDEEE:EE5EEBFFFFFBGFGGFGGGDFGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_31/1 65 chr10 29460867 37 91M * 0 0 GTGGTTTGTTTGCTTATCCTGTATCTGGAGTTGAGAAGAAACCAAGTTTGAGATTTAGGATGTGGACAGGAGGTAGGAGTTTTGCCCATAA GGGGGGGGGGGFEGGDFFFEDCEEDDDD=DBEBBEEEDEEFEDAEAG XT:A:U NM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_32/1 83 chr7 79683300 60 91M = 79682929 -462 TACAAAAGAGGGGTTTAAACAGTTTTTTTCCAAATGAAATTTGAATGTTTTTCAGTTTTTTTATTGAAGCATTACATAGAGCATTACATTA GGGGFGGDGFFFFEFDEFDDGG@EEEFBGGGGGGGFGGGFBGGGFFGGGGFFGGGGDGGGGGGGGGGGGGGGGGGGGGFGGGGGFGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_32/2 163 chr7 79682929 60 91M = 79683300 462 TAGGAGATTCAGCACTGAAAAAGAATTTAAGACTCTATAAAGCAATATAACAAAAACTGAAATAGTTATTTCTAGGAGGAGGAAAAAGTTT GFGGGGGGGGGGADGGGGGGGDGFGGGGGGGFGGGGGGGFCFFFFGEGGGGGGGDCGGGGFFGGGGDGGGGGGBDEFFEFGGEGF=DDABE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_34/2 129 chr13 56061006 37 91M * 0 0 AGTTTGGTAGTTTTTACAGCCTTCAAGTTTTACAAAATATTTATTTGGTGAGGGCAACTTGACTAATCACCCTCAGGACCATGTTTCAGTA FFBFFFFFFEFFFFFBDEEEGGFGGGGFFFEEEEEFDFFBGGGGFECEFDEBEEDBBFFFDGGEDFF?FFGEGGBDD?DEFGG@DED==CF XT:A:U NM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_35/1 99 chr1 184430571 60 91M = 184430964 484 TTCCAAACACACACATACACACACACACACACACACACACACACACACACACAAACTAATCCTGAGAGTGGCTGAATTACGACGCGAGTTA FEFFFFDFFFFGEBFFFDEFGGGFGGGFFDGEEGFFGGBFAGDGEEFFDFEE=BE-568.:<;?=8?;4*>>=>9A############### XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:82T8 foo_35/2 147 chr1 184430964 60 91M = 184430571 -484 GGAATCCAATAACAAAAAAGAAACAAAGCATATATTTTATATATCTTTTTCATTTTACTTTTCTAGTTATTTGTATTATATTCTATGAAAG CEB5=EC??8>GGF>G@EE@:EBEEED=EA=GEFGGEFFFFEABEDEE??EEEEDECB?DDFFFFAE?EADGD:DEEGGGGE5GGGGGAGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_37/1 83 chr2 78122797 60 91M = 78122426 -462 TGTGATATTCTCATTTTCTCAGCCTTAAGAAGTCAAATGTCTCTGTAGGATAAGATATTCAAGGTAATATTTTCCTGTAAAGAAACACAGA ?B=EGGFEBFFAA?B5BEEGEEEBD?DEEGE?BGGGGDEFBFGEGDGGGGFGGGGDGFGGGGGGGGGGEGGEFDFAGGGGGGGEGGGGGGE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_37/2 163 chr2 78122426 60 91M = 78122797 462 AAAGTAGTGTCACTAGGGACTTCAATAATCTATCTGAAGTACATATAGCAGCATCTGGATTATTTGCAATCTATTAGCTTCAGATCCAAAA EEEEDEEEEEGGGGGGGFGGFGGGFGFGFFGGGGGFE?EECDCDDFGDGGDGGEGAGGFDGDBGGGGFEGGGFGFAGEFG?;A-:BC=BCF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_39/1 99 chr10 111115054 60 91M = 111115440 477 ATTAAGGTTATGGGCTTGATGTTTCTAGTTGTGGTTGTGTTTCTGCCAGGATTTGGTATCAGAATGATGCTGGCCTCCTAGAATGAGTTAA GGGGGGGGGGGGGGGGGEGGGFGGGFFFEFFFEFFDDADBEEEEAEE=DE=BDCCDAA?A=C=AADB=CBC@EABA?AA-<088<>A:A-? 
XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_39/2 147 chr10 111115440 60 91M = 111115054 -477 GTCTTGGCCTAAAAGCTCCTTCAGTTGATTAAGAACTTCAGCAAAGTTTTAGGATACAAAAATCAATATATAAATATTACTGTCATTTCTA FEDEDGEEFGGGGFFBEGGBEGGGFGGEGGGGGGEGDGGGGGGGGFGFGF?GGGGEDGGGGGGGGFEGGDGGGGGGGFFGGGGGGGGDGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_41/1 65 chr2 201662148 37 91M * 0 0 AGACTCTGTCTCTAAAAACAAAAAAAGTTAAAAAAAATTAGCCAGGCGTGATGGCGTGCACCTGTAGTCCCAGCTACTCAGGAAGCTGAGG C:C5A>>C9@@A@D?BBBBABBB>=>=>@?:?########################### XT:A:U NM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_42/2 145 chr16 82783352 37 91M * 0 0 AGTAATCCTCCCGCCTCAGCCTCCCGAGGAGCTGGGACTACAGGTGGACTCCACTACACCTGGCTAATTTTTGTATTTTTTTTGTAGAGAT #########ABD@GGFGGEFFFDGGGFGF?GEGGGGGGDGGGGGGGFFFEFECEE?F@FFFGEGFGGGGGGFGGFGGGGFGGGFGGGDGGG XT:A:U NM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:3G2A84 foo_43/1 83 chr2 4935961 60 91M = 4935586 -466 ACCACGAACATCTGACCTTGGCTGGAGTATAACTGTTATGATTACTTGAAAACCACCACTGCCAGGGCTGAAACCTTTTAAGTTACTCATC CFBDEEBBABBECC@ECECE@EDEEE?CEEADDEDFBGEGFEEEGGCEEEC:EECEGGGGFGGFGGGGGGGGFGGGGGGGGGGFGGGGGGG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:5A85 foo_43/2 163 chr2 4935586 60 91M = 4935961 466 AGAAACATCCCCAGTTTTTTCAAATGCATTGAGTGCTGTTAACCATTGTAGAGGATCAGACTGTTAAATTAGGAGGTAGATGCAATGTGTA GGGGGGFGGFGFGGFGFGGGGGGFDGBGGGGEGBGFFGGDGEGGGGGFGGGEGGFGGGEFFGGAGEFGFGAGGEGD?CECBCC@BBEAEBE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_45/1 81 chr8 43942323 5 91M * 0 0 CATTCGCAGAATCACGTTTGTGATGTGTGCACTCAACTGTCAGAATTGAACCTTGGTTTGGACAGAGCACTTTTGAAACACTCTTTTTGTA F?F?EGFFFBFDGEGGEFGEGEGFDFEEEAAEGFFEGFGBGGGGGEGEFFEGGFGGGGGFGGGGGGGGEAGGGGGGGGGBGGGGGGGGGGG XT:A:R NM:i:0 SM:i:0 AM:i:0 X0:i:7 X1:i:12 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_46/1 99 chr1 160898441 60 91M = 160898820 470 AGGAGTGAATTAGAGCCCCTTATGTTAAACCTTGTTTCTTGGAAGCCCTCCTGGAATTCCCTAAGATGCAGAGCTGATTAGCTTAGCTGGT GGEGGEGGFGGGGGGGGGGGGGGGGGFGFGGFGGGGGGGGEGGGGEEGEGFFD?EBFFFEGEBGE:=A@5CCCBCEFFFEDFBFFEDFAFC XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_46/2 147 chr1 160898820 60 91M = 160898441 -470 ATCTGCAGGTTAGGTTGGACATTTACTTCTCTGCTTTTGCAGCAACTTCTACACCGTCTGTTCTCATACCACATTGTGTGGAAATTATCCA @:B=BA;AAA5:=D=CAA>?5DDC=DD=:?BB:AFFF?FFFAAAAAD?DCAFDFFFFGDEGEEEDEEBFFFGFGGGGGEBGGGGEGGFGFG XT:A:U NM:i:2 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:75A1A13 foo_48/1 83 chr11 88357992 60 91M = 88357628 -455 TCTGGAAAAACTATTTTCATACCCACAATCCTAAAGGTTAGATGCCATAACTCATGCTCCTACATATATAACCAATTTTGTTTATTTATTT EEEGEGGFECGFFFFF=???-@D??@FEDFF:GFGGGFGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_48/2 163 chr11 88357628 60 91M = 88357992 455 TCCAGGGCATCATATACTAAAGAATCATTCCATCTAATTTCTTCAAATCTCAAGAACACACACCTGGCAAATCATGAAAATAAATTTTATG GGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGFGGGGGGGGGGFBGGGGEGGEGGDGGGFGGFGFGGFDDGGGGDGGGGEGDGDGFEDG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_50/1 99 chr1 210048059 53 91M = 210048451 483 CCAGCACTTTGGGAGGCCAAGGCAGGCAGATCACTTGAGGTCGGGAGTTCGAGACCAGCCTGACCAACATGGAGAAACCCCGTCTCTACTA GGGGGGGGGGGEGEBGGGGGGGGCGDBBFDEDFEEFFEFECAEEB???:@BAACBEEDFE5=>@>C=B=5?@?:@A@7=C>>>>@EDD?5? 
XT:A:U NM:i:0 SM:i:16 AM:i:16 X0:i:1 X1:i:5 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_50/2 147 chr1 210048451 53 91M = 210048059 -483 TCATTAGCCACTAGGGAGATGAAAATTAAAAACATGACAAGCTACCTTTATACATTTACTAAAATGAGTACAATTTTAAAATATTGGCAAT ######F=E@ECCCCC:FFFBFF?BEAAEEE?EEEE?FEEBDCCCDDEEEFBCECEB@EE=EE=EEF?FBDFFFFEFAF:FEEEEBAB:CC XT:A:U NM:i:1 SM:i:37 AM:i:16 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:17A73 foo_52/1 99 chr1 210048059 53 91M = 210048451 483 CCAGCACTTTGGGAGGCCAAGGCAGGCAGATCACTTGAGGTCGGGAGTTCGAGACCAGCCTGACCAACATGGAGAAACCCCGTCTCTACTA GGGGGGGGGGFFGAGFBGGGGGFEGFFBED?EEEEGGDGGCFFFF>B2?=CCC?CEDEDGFEFBFFFFDDAEAECDD2CCCAACAGGBEEE XT:A:U NM:i:0 SM:i:16 AM:i:16 X0:i:1 X1:i:5 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_52/2 147 chr1 210048451 53 91M = 210048059 -483 TCATTAGCCACTAGGGAGATGAAAATTAAAAACATGACAAGCTACCTTTATACATTTACTAAAATGAGTACAATTTTAAAATATTGGCAAT 6?-BCBFFFBEDGGGGGGG?FGFGGFFFGGAGGGGGEEE:EEECEEGGGGGFFBBFFGGGGGGGFGGGGGFGFGGGGFGGGGGGGGFGGGG XT:A:U NM:i:1 SM:i:37 AM:i:16 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:17A73 foo_54/1 83 chr5 99043369 60 91M = 99043015 -445 GAACACAAAAAATTTTCATTATAATCATACAATCAACTGACAATTCACACATTAGAATTAGCAGAAAAGAAAATTAAACACTTATAACTGC =E=@;AEEEEDAABAA9;>8=>C>:6=EEEEE-DEEAAA@@=@DDDDD XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_54/2 163 chr5 99043015 60 91M = 99043369 445 AGATAGCTCATTTCATGTGTTCAACTGAGTATGGATCAGCACGTATTACTGAGGTAAATGCCTAAGGCTGGGGAAAGAAGCACCCAATACG D?A5DBD=ABDDDDBFFEDFDB:???>C@@:AA?C=5B=:2676;77277@=C=@CA=:A>?@@;:-9A;72:(9:26A);B;8;AC>B## XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_56/1 99 chr5 61899391 60 91M = 61899789 489 CCACCTCAGCCTCCCAAAGTGCTGGGATTACAGGCGTGAGCCACCGCACCCCGCTACATTTGCTCTTTTGATGCTGTCCCATAGATCTCGT EEEAE=DFFFGGEEF?EEBDFGFG?D:BDE?CAA=E?=AEC=DDD,DDCDC?C178:;*=;*7=/?>>64:844=C@-AC@C:?####### XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_56/2 147 chr5 61899789 60 91M = 61899391 -489 CCCCCCCCCTCAGCCTCCCAAAGTGCTAGGATTACAGATGTGAGCCACCACACCCAGGCTGACTGTATTTTCAAATATTCTGTCTTCTACG ##########?=/?;2=>,C@?DCAB<;97=CC<6@EE5EE?E?EEE=DEEFBFAFEEDEEE?EED?GEGFFGGBGGFDEGEEEED=DDDA XT:A:U NM:i:2 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:2A3A84 foo_58/1 83 chr3 121089753 60 91M = 121089378 -466 AAATTGTTTAAAGCAAAGTAACAACTGTTGCGGGAAGTCAGGGACCCTGAACACAGGGACCAGCTGGAGCCACGGCAGAGGAACATAAATT A<0;22B>:C>?9?EFDFFAE:EEA=:@CCA-DDEEBECECA?EEEBEEFEE?FGGFFGGGGGGGGGGGGGGGGGGGGF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_58/2 163 chr3 121089378 60 91M = 121089753 466 TCTTCAAGAATGCCAGAAACACATTATAGACAGGGTATATACAGGGTATCAACAGTTGACTTCTTAACAGAAACAATGAAAGCCAGAAGAA GGGGGGGGGGGEBGGGGFGGGGFGGDFDGGFFFEC=AACADEEEAEBEDEEEEEEEGFEAFEGEEFBFEDFAFEFAEEDFGEBFEFBB:=E XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_60/1 99 chr7 91156329 60 91M = 91156701 463 ATGTAGACTTACACACACAGTTTAATCAAGTAATTGTGCCTATTTCTCAGTAATTTAAACTTCACATTTTTGCTTTCTTAAAATCTGAAAA DDGFGGFGFEGGGGGGGGGFGGGGDEFBFFFEA?FFEFEFGFFGAEDDD==ACC@AEEEEF=D?DBBCAB>;=;=DDDDCBCCE?A:5?:= XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_60/2 147 chr7 91156701 60 91M = 91156329 -463 TCCCACACGTTGTGGGAGGGACCAGGTGGGAGGTAACCTCATGGGGGCAGATCTTTCCTATGCTGTTCTCATGCGAGTGAATAAGTCTCAT ?CCB5=GF?GGGEGFFCFGGAEDECEE:DBF:FFFFFDDFFEEA=EEDB=EFDFDDDF=FFFFF?EDGGGGE:EDEED5FEBGGFDGGDGE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_62/1 65 chr19 37869667 37 91M * 0 0 TGGGGGAAAGCTGTACATATTTATGAGGGGGTTGACCACATGTGCAATGGGTAAACATACAGGTAACATACATCAACCATGTTTATTTTGG 
:=--CCC?CC:.=?=DB:DA?@=@6=:?=-.&7=659;5;==:>=6*7:*;?23<('.54???5>>@4B?CCA@################# XT:A:U NM:i:1 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:75C15 foo_63/1 83 chr4 91585929 60 91M = 91585529 -491 AGCATGACATTCCTGTGGTAATATTGGGTTATTACAGTTACATTTGTCATATTTGATTGAATGCACAGAATGTTGCTATATATCTTGACTT FGDGGGEFEFFFFGFGEFGFBGGGDGGGFGGAFFGGGGGDGGFGGGGGGGGGGBGGGGGGGFGFGGGGGGFGGFEGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_63/2 163 chr4 91585529 60 91M = 91585929 491 TTTGGAAATGGTATTTCAACAAATACTTAAGTGGAGGTTTGTGATTGCAGTGATTATTTCAAGATCTTTGTGGACTGGGAGTCTTCCATTC GGGGGGGGGGGEGGGFGGGGGGGGGGGGGFFEFFEEECEEFEFDDFEFFFGGFFFGGGFGGGEDDDGFGGEAF?GGFG=EBBE5EEDE?EG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_65/1 83 chrX 80714354 60 91M = 80713989 -456 CATTTCATAATAAAAACACCAAGCAAATTAGGCATAGGAGAAACATACCTTTACATAATAGAGGCCATATGTGACAAACCCATGACTAATC 4<@2;/E>DEEE?EBEEE:EEDBA:A?=B?CFCFFFDBDC?@DDCAEEBEEFFAFFFFDBFFEFFFEEEEDEB=DEEEDCAEEDEE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_65/2 163 chrX 80713989 60 91M = 80714354 456 AATTTTACTTAAATGAGTTTAAAAATGTGAAGAGGAGGGAATTCTTCCTGACACATTCTACAAGGCCAGCATTACCCTGATGCCGGAGGCA E=BDBDEDEBFFDFFEECEEACACCBDBDDEDEEE=BD>DCBDDDEDAE:EEE?:A?AAADFFDA=>6>>D==:CEEACBFD=AEBBDB## XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:89T1 foo_67/1 99 chr8 127325209 60 91M = 127325583 465 CCTTGAATTAGTTCAATGAATGAGAGTTACCCAAAACCAGTATGCCAGCACCATTCTGATATCTAATCTCACAGACATCATGATATACATA GGGGGGEGGGGGGGGGGGGGGGGGFGEGGGGGGFGGGGGFFGDGGGGGGGGGGDBG?GGGGFGGGFGGGGGGGEGGGDGGGGGGGBGGGGD XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_67/2 147 chr8 127325583 60 91M = 127325209 -465 CACTCCCTCCCATGCTGTGCTGGGATTATGCTTATTATATTATGCTGCTAATATGGATAAAGTCATACTTTCTTGGTGTTTATCATAATAA C/C5@EGBGGGEGG?BGGGGFFFFD?GGGGGFGGGGGGGDFGFGGGGGGGGGGBGGFGGGBGFGDGGGGFFGGGEFGGGGGGGFGGGGDGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_69/1 83 chr10 88166322 60 91M = 88165954 -459 GCTTTTTAGTACATTCAAAGGCTTGTGACTAGGAAAGTAAAAAGCTCAGTCTCACTGGAAAGTGATAACTTCATTTCAGTACATCCCCATT DDDEBEDDFEADEBE?FFDGGG=GGGEDEEAGGDGGFGGGFEDDBFFFFFE=FGGGFGEFGGDGDGDDGGGEGGGGGFGGGGGGGGFGGGB XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_69/2 163 chr10 88165954 60 91M = 88166322 459 TTACTTAGAAGGGGTCGGCAGAGGGTGCTGAATATTGAGTCATATCTGTTGCCTACTCTGTTGCCCAGGCTGTTGCCCTCACCCTGGTGCC GGFGGGGGBFDEDEEEGGAGE:AEE3D=DDEE5EEDF5FEBBDADCEDEEDEEEEGEGEGEFD?F=5@@??AAB################# XT:A:U NM:i:2 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:82T3T4 foo_71/1 83 chr9 91795542 60 91M = 91795154 -479 TACAGAAGGGTGTGTGTATTAGTGTGTATGAGGAATGTGTATATGGGGTATGTGTGTCATTGTGTATACGTGAGTGTGTATGTGAGTGTGA #############################################A584)5B5BBDB.?;==>>=B5-=?=DDD5DDDDDDB5DDDDD:=? 
XT:A:U NM:i:2 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:4T44G41 foo_71/2 163 chr9 91795154 60 91M = 91795542 479 CAGATCCATCCTGCTCCACTCTCTGTTTCCCCTTCCCTGGAAACGAGTGGCACCTCTGGTCCTCACAAGGGCCACACTCAGCTCTAGCGGC 3777,,3737;5;55=?>5=A==ACAA;?::CCCC:C::=>5-8;<@9;5AA>>>C:???DCAD=C:D?:4?################### XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:29T61 foo_73/1 83 chr9 80693192 60 91M = 80692839 -444 CAAATGTATTAAAAATGGTTAAAGTGTGCTCTTAAAACAGTTGAATTATATGGCGTGGAACAAAAACAGTATGCCTCATATAGCTACTTTT EFGGFGBDGGGGGGGFFFFFBGGFFFGGGFFFGGGGGGGGFGGGEGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_73/2 163 chr9 80692839 60 91M = 80693192 444 ACAAAATGTGGTATATCCATACAATGGAATATAGTTGGCAATAAAAGGGATTGAAATATTAGTACATGATATGATATTGTTGAACCTGAAA GGGGGGGGGGGGFGGGGFGGGFGGGGGGGFGGGGGGDGGGGFGGGGGGGFGGGFDEGGGGGGGFGGGGFGEGGFEFEGGGGGGGGGGG?FE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_75/1 99 chr3 42286306 60 91M = 42286671 456 CACCCACTCTTCCCCATCATGGAATGAGACGCACCAGCTGGTCCCTGCCTTTATCTGGAGGACTGAGAGTGGAGGGGGCCATATCCAGATG GGGGGGGGGGGGGGGBGGGGFGGGGGFGDGFEGGGGGGGGGEEGGBFFDDGGGGGFGGDDF=EEE>?:C;AA=@AD############### XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:76T14 foo_75/2 147 chr3 42286671 60 91M = 42286306 -456 TTCATAGAATTTAAGCTTGTTACAAGCAAGCCCCACTTACCAGAAGGCAGGAGGCTATTTGCTGTGATCAGAAACGTGATGGTTAGAAGGC E@FAF?GG?FGFF?AFFEFFB?DGGDGGGGGFFGFGGGGGFFGGGGGGGGFGGFGGGGGGGGGGDGGGGGGGGGGGGGFGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_77/2 145 chr2 88437391 37 91M * 0 0 TTATTGAGTAAAATTAAAATCTAAATAAATACAAAGACATACCTCCTTCATAGATTTGAACACTCAAGTTCATAAAGATGTCAGTTTCCCC 8CD?:CDEEGGGGEDGGGGD?GFCGEFFFDBAGFGGEEEECEEEDEGDGGFGGGGGFEFEFEGFFGGGGGGGGGGGGGGGFGGGGGFGEGG XT:A:U NM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_78/1 99 chr6 31022292 60 91M = 31022683 482 AATTTTGCCAATTACTTAACCTCTTTGCGTCTCCTTTCCCCACCTGTAAAACAGTAAGATCACACTCCTCACTTCACAGGTTCATAATTAT GDGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGEGGGGEFFEGGEGEEGGEFEEGEGEEGGFEGEEEBEECECFFFFEAEEEF XT:A:U NM:i:0 SM:i:37 AM:i:25 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_78/2 147 chr6 31022683 60 91M = 31022292 -482 TCCCCTCCCTTGTAGACCCTGCCTGTCTCATTGTAGAGCTCTAGAAATGTTGCCCATTGCTATTGTTGTGGGACTATGTACAGGTCACTTT #################EEEEEGGGAGGGGFGGGGGGGGGGGGGFFGGGGFDGFEGGGGFDGGGGGGGFGGGFGGGFGGGGGGGGGGGGFG XT:A:U NM:i:4 SM:i:25 AM:i:25 X0:i:1 X1:i:0 XM:i:4 XO:i:0 XG:i:0 MD:Z:1G5A8A30C43 foo_80/1 99 chr3 65874913 60 91M = 65875296 474 AGCCAGCATAAACCCGGCGGGGACTTCTACCAACAGAGGGCCTCCTGTGAAAAGCCTGGAAAAAAAAGTCAACTCTCTGTTGCATAGTGAG FFFFFFFFFFFFFFFFFFFFFF0F=EBE=EACFFFDFFFFFDEFFFFEFEEE?E?FEFEAAEFEFCCCBBDFADBCBBC@FFBFEBEEB:E XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_80/2 147 chr3 65875296 60 91M = 65874913 -474 ACAGGTGCAGTCCAAATCCAGCCCACAGGCTACCAGTTTGAGAGCTCTGGGAAGGCCACCATCCTTAAGAGGATTTGAGCCTAATGTTCGA #################BA?>DD:5BEE=BEEGGGGEE:BEEFFF=F?GGEF?FFFEAEEEEFFEFDGGGGCGGDGGG=EGGFGGGGFGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_82/1 99 chr3 59620239 60 91M = 59620620 472 TTCGTGTCTTTCGAGCCCTGATGAGGGCATGGCTCCCTTAGCCAAGTCAGTACATCCTCTTCCCAGAGGAGAGCAACACTTTCCTGTATGG BEDBEEEDEEEE:DB:B??=DDAD=?;?DDA5::A######################################################## XT:A:U NM:i:2 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:66T1A22 foo_82/2 147 chr3 59620620 60 91M = 59620239 -472 
CAGGGAAAGTGCTGGGCCTGTCATTCTGGCTGCCCGTCTTCTTCCCAGCCTTGCCTTGCTGGGAGACTTGACCTGTCCCCCTGCTCTCTGG #####################A:AA>977:+*4>=>-@6@@?+===5D?E=CA:5AD?:DB?5?<-5><9::55>=5995=@;@,BB?:A= XT:A:U NM:i:3 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:3 XO:i:0 XG:i:0 MD:Z:16A3G63A6 foo_84/1 99 chr16 18158667 36 91M = 18159047 471 GAGACCAGCCTGGCCAACATAGTGAAACCCCATCTCTACTGAAAATACAAAAAAAGTTAGCCAGGTGTGGTGGCAGGCACCCGGAAACCCA EAEEEEABEDAAC??:C?A=EECDE-;B=;A5A>A?BC5AEEC:BEEBBBC<7880=1=A>ACA########################### XT:A:R NM:i:3 SM:i:0 AM:i:0 X0:i:2 X1:i:0 XM:i:3 XO:i:0 XG:i:0 MD:Z:81T1T2T4 XA:Z:chr16,-16556437,91M,3; foo_84/2 147 chr16 18159047 37 91M = 18158667 -471 GCCTCCCCCAGCTTTTATCTCCTCAACAGTGAAGTGGGTTACAAGATTATTTTGCAAGTTAGAAATAATGTTTGTGGAACACACAGGACAG A:82C,AAA:B=6?@;D5BFEC:CCC::AAA>==@>EEDEE@B@@8>DB?DC-==CBAA;A>D;EE:=CEACEEDE;;59>B?5:<:A?A5 XT:A:U NM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_86/1 99 chr12 23830502 60 91M = 23830869 458 CAAACTATTGTATGTAGCTATTTAAATTATAAAAAGTCCTTGTTTTAAACATACATTTTTCTTTTCAGGAATGATAATTACTGCTTAGATA GGGGGGGGGGFGGGGGGGGFGGGGFGGGFGGGGGGGEGGGGEGGFGGGDGGGFGFFGGGGFGGGGEBGGCGGGGGGGGG?GGGFGGGEGEF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_86/2 147 chr12 23830869 60 91M = 23830502 -458 ACTATAACATCAAAGTATGACAAATCCACAAATGAGATACATTTGAAAGCCAAGTATTATAACATTTACTGTCTCTGAATTTTTATTAATA EDFDGGDGGDGGGGGFGGGGGGFGGFGGGGGGGGGGGGGGGGGGFGGGGFGGGGGGGGGDGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_88/1 99 chr2 103054674 29 91M = 103055053 470 AACCCCATCAAAAAGTGGGCGAAGGATATGAACAGACACTTCTCAAAAGAAGACATTTATGCAGCCAAAAAACACATGAAAAAATGCTCAC GGGGGGGGGGGGGGGEGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGDGE>EFFFGGGGFGEFGGGBE?E9EEAAGGGEGGGFGDCEFEGG XT:A:R NM:i:0 SM:i:0 AM:i:0 X0:i:34 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_88/2 147 chr2 103055053 37 91M = 103054674 -470 GCACACGTATGTTTATTGCAGCATTATTCACAATAGCAAAGACTTGGAACCAACCCAAATGTCCGATAATGATAGACTGGATTAAGAAAAT EABECEDFD;FFGGGFFDGGGGBEDGGGFGGGGGGGGGFGGGGGEGGGGFFGGGGGGGGDGDGGGGDGGGGGGGGGGEGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_90/1 83 chr12 116461306 60 91M = 116460928 -469 GCTAGAGGAACCTTCCAGATTATCTTGTCCAACTACATCTTTTCAGAGGTGAGACACAGCAAGAATGCAATAGACCTGGATCTTGACTCCA CABEEEBCEBEFEBFFFFGEEF?FBEF=FBEDFABFFEGFFGEFGGGGGFGGGGGGGGGGGGGGGFGFGGFGGGGGGFGGGFGFGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_90/2 163 chr12 116460928 60 91M = 116461306 469 ATTTTAATTTATTTAAATGTAAATAGCCACACATGGCTAGCAGATACCATGTTGAACAAAACAGATGTAGACTCTGGTTCCAACTGGCCAT GFGGGEGFGGGGGGEGFGGGGGFGGGGFFGGGDGGFGGGAFFFAFGGFEGGGE=?DEDED<:D?CEBCEEEDEDCCCAAB########### XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_92/1 99 chr19 9744682 60 91M = 9745053 462 AAAACCGACCATGCCCCTGCACTCTAGCCTGGGTGATAGAGCCAGACCTTGTCTAAAGAAAAGAAAAGAAAAAAAATCCATTAAATAATGC GGGGGGGGGGGGGFGGGGGGEGGGGGFGGGGBEDGGGGGFGGFGFGFGDGGFGGGEGGGEEEGCEC@DBDCBCDCGGGGGGFGFGGEGEGD XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_92/2 147 chr19 9745053 60 91M = 9744682 -462 ACTTTCTTAAGATGATTGCCTGTACCTATAGCTGAGTGTGTGTGTGTGTCTGTGTGTGTGTGTACTGGAAGCATTAGCTAATACAATTCAG :4BBC==7?:4A?=A=AA:5ACCB-EEEDFC=AFFGFFFGGGDGGEGFGEGGGGGGGGEGFGGGFGGGGGFGGGGGGGGGGGFGGGGGGFG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_94/1 83 chr14 38814826 60 91M = 38814461 -456 TTTGTTTGGTACAGGTTATGAAGTATAGATAGTACCTTGGAAGGAGTACATATTTGGATTAGAGTTATAAAAGATTTTGGAGAGGAAAATA 
FAGFGDGGGGGGGGGGGFGEGFGGGGGGGGGFFCEFFGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGDGGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_94/2 163 chr14 38814461 60 91M = 38814826 456 AAGAAATATTATTGTACTAAAGGTAATTACGTGTTTGATCTGAGAGATGCTTTTGGAATTGCCTTTTAAAGTAGAGTGTCATAGGTCATAT FGGGGFGGGGGGGGFGGGGGFGGFGGGGGGGGGFGEGFDGGGGGGGEGGGGGGDGGGGGGGGGGGGGFBFGEGGEACEDEEFFFDDGDECE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_96/1 99 chr5 29693887 60 91M = 29694260 464 GGAAAAAAGTTAAAAATAGCATAAAAATAAAATTTAAAAAATTTGTGTTAAACATTAAAGATACAGAAACCTAATGGTTGGAAAATCAAAA D=:DDDAD?A;<<;;CA=A?DDDDDDCCCCA6>?D;;9;?=)B=?>C@@>6?>=@CC:?55;0990:?,1>?B2C@C6@@A5C=C:--?=? XT:A:U NM:i:4 SM:i:25 AM:i:25 X0:i:1 X1:i:0 XM:i:4 XO:i:0 XG:i:0 MD:Z:41A17A5C20G4 foo_96/2 147 chr5 29694260 60 91M = 29693887 -464 TTTGTAAATTTTATGACATTTATGAAAACAGTGTTATTGAATGCAAATGTTTAGCATATGTTCATATTTTAAAGGGATAGAGACATGGAAA B5CEBC:?E?DEEBB@CCBC=ECBB:A?AA?:DA5DAECBC@CACCDCD?5ADC=?EBE=DB?BGAFFFFFD?C?D72/(52;C;C?@DDD XT:A:U NM:i:0 SM:i:37 AM:i:25 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_98/1 99 chr16 11003510 60 91M = 11003870 451 TCTGTGCTTCAGTTTCTTCACCTGTAAAATGAGGATGACACAGTAATATCTCATGCAGTTGCTATAAAGTTCAAGTGAGATCATATAGTCA D=DEEB=BBDEEEBEAEE?EEEEEBEEEDEDBEEEEE?EEDEEDBEEE?=?DB?-B=@@>B?D?5@CCC>6>@>>C:C=E:BDADDE??:A XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_98/2 147 chr16 11003870 60 91M = 11003510 -451 TCTGTGTTCATGGGGTAAAATGGCAGTGTTGCCACATTGATGTATTGCATTGTGGTGAAGTCAGGACCTTCATTGCATCCATCACTGGAGT #?5??A/-4*.-;7=07?<:4>B?5>?C@-@A:>A?=C=EEAC5CCBCD:CEDEBF@@:?C:8;6:A5DC?@B@AD=5:C5C?A5CBEC?E XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_100/1 99 chr8 109999747 60 91M = 110000128 472 GTTTTCATCTCCTTCACTTTCTTATTTAAGGCAAAAATCAATTCTGGTGAACTCCACATATTTCTTAGGCTATATAGTCACTCTTTCTAAA C==@>ACAA5@CACAEEEEEDEEBBEEEEEEEEBEEEEEEBE5E?A<:>:AA???B?EE=B?B@B?B@5B??=??>>3@C>:@@=BE???B XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_100/2 147 chr8 110000128 60 91M = 109999747 -472 TAAATATTCTTTATGACACTTAATTTTGTCTATCTGGGATTTAAAGAGATGTAGTCAGTAAGCAGTTGCTGTTCACAGGTCTTTTCTAAAT FD5:?E==@>;=@CB::CB:DE?EA:C?@?=C5C:C:DBDBBCDDDB?@@.DD:ADCAABABD?DDC:CAAD:DDEDBDDBADCADEF?ED XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_102/1 99 chrX 108127694 60 91M = 108128067 464 TCATATTCATCATTCTAAAACACAACTCTCATTGTGTCACTTAAGATCTCAGTTCCTTCAGTGGCTTTCTTTTGCCTATGGGATGCAATTC FFDFFFBFFBFFFEFFDFFEFEFEA=DDDDFFFDFEAFFDEEEEEFFF=FBD@?BABC:5>;BB:FF?@FC:;;:AB?AAC:CBC??B?EB XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_102/2 147 chrX 108128067 60 91M = 108127694 -464 CAACCATCACCCGAGAAGTATACATTGAACCCAATTTGTAGTCTTTTATCCCTCACGCCCTTCCCACCCTTGACCACTGAGACCCCAAAGT ##@DBC@A=CC?5?-CC-CCA>@@;>>@9B=D?EEEE?EEBDD6DDC:4CAEEEEDD?D=C@3>DB=DE?EFFFFFDDDDB>;C@>==BB> XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_104/1 83 chr8 53287150 60 91M = 53286795 -446 GCCCATTAATGTGAAAAAAAAGTAATATAGGGTAAATCTTACCTGAGCTCCTGGATTAGTGAATCCATTGGCACTGTTGACAAAAGAAGAA GBDGDEEGFBFGGGGFGEGGGGEGGEEEEGGFEGGFDGDECGDFFFGGEGGEGGFGGGGGGGGBGGGGGGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_104/2 163 chr8 53286795 60 91M = 53287150 446 AATGCAAGAAGTGGATAACTTAGGATCTGGGTTATCCATCAGTCAGGTTCATGGATAACAGCTCAGCTCTTCCCTGACCATCAGTGTGAAA GGGGGGGGGGGGGGGEGGGGGGFGGFGGEGGDGFGGGGGGCGGGGGGGEGGGGGGGGGGGGAGGGGFEGFG5GGBC-ACBCBCDC??5?CC XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 
MD:Z:91 foo_106/1 83 chr12 119228405 57 91M = 119228013 -483 TAATCCCAGCACTGTGGGAGGCTGAGTTGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGACCAACATGGAGAAACCCTGTCTC CFD@FFC@CC@C,:BG=BEEFFEFB>CEEEEEEE5BEEEEEEEEEEEEEE5EDEEEEEEEBEEEEEDEEBEEEEEEEEEEEEEEEEBE XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:48C42 foo_109/2 147 chr8 104387743 60 91M = 104387376 -458 CCACTTTCCTATACAGTAGTCCATTGACATTCTCAAAGAATTTACTCAAGACTTTTGTAACCACTTAAACAGTATTTGCCCAGTGAAGTCT ;E@BEECEDEEDBEEEGFEFGFDEFFFGBGGGFDGGGGGGFEEEEEEFGGGFGGGGGFGGGGGGGGGGGFGGFGGFGGGEGGCFEGFDDEB XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_111/1 83 chr5 78062243 60 91M = 78061858 -476 TATACCTACTGCTGTTGCCATCTTGAAATTCTTAAGAATTTTGAACAATTGGCCCTATATTTTCAGTGCACTTGGCCCTGCAATTATGTAG EEEBDEEEEEEEEEDDEEDEDDEEEEEEEEEDEEEEEEEEEEEEEEEEEEEEEEEEDEEEE?EEEEBEEEEEEEEEEEDEEEEEEE?EEEE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_111/2 163 chr5 78061858 60 91M = 78062243 476 CTTCCCCCCTTTCCCTCCCCTATCCACTACCCTCCTCGGCCTCTGGTAACCATCATTCTATTCTCTATCTCCATGAATTCAATTATTTTGG GGDGGGFGGFGGGGGFGGGGGFGGGGGDGGGGGGGGGFCFGBGBEFFFFFGGGGFGGFE?GGGGGGFGGGFFEGGFEEFFEEDFEGGFGEE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_113/2 145 chr18 58963478 37 91M * 0 0 GGTTGTGTCTGCAACGTGCATGACCAAGAGTGGCTACACACCACGGACAGAGCATAAGCCCTCAAAGAAGAGACCCAGGCGGATACAGGGG ####################################################################################?>=;6@6 XT:A:U NM:i:3 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:3 XO:i:0 XG:i:0 MD:Z:23A11C43G11 foo_114/1 83 chr8 109625334 60 91M = 109624953 -472 CTCAGATGATCCTCCCACCTCAGCCTCCTGAGTAGCTGGGACCACAGGCAGGTACTACCACATCTGGCTAATTTTTTATTTGTAGAGATGT ?479:4:;<'CA7CCA=A??# XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_120/1 83 chr15 57813470 60 91M = 57813098 -463 GGGTAGTTATAGTCTACAAAAGCCATATAGGCCATGCTAAAGATTTTGGATTGTATTGTAAGTATAATTTTTAGGTTGGGGGCTAGTGGTA EEDEE=FFGGDG??GGEFGDFGEFFEGGGGGEBGGGGGFDGFGFGGGGGDGGGGFGGGGFGGGGGGGGGGGGFGGGGGGDGGFGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_120/2 163 chr15 57813098 60 91M = 57813470 463 TTGCTCGCTTGCACTCTCTTGCCTTTCTGCCTTCCACCATGGAATGATGCAGCAGGAAGGCCCTCACCAAATGAAAGCCCCTCAACCTTGG GGGGGGGGGGDGFGGGDGEGGGGGGGFDGDGDGGGEGEDGFG?GGGFFDGAFEGGF=BDFDBF?ABEFC=GF?GG?FDFFFDEE=CBD5D-A839?=;;;>C5?,CCC>C*?=BB8>.6>17?############# XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:86T4 foo_128/2 147 chr13 99220942 60 91M = 99220555 -478 AGCCACTGCGCCCGGCCGGGTTTTAACTTTTCTTAACTGGCAGTTATTTTTTTTGAGACAGAGTCTTGCTCTGTTGCCAGGCTGGAGTGCA ######?B:>3:2<-;9;;+@E:EED;;@=@:??B5>;9;;?<<:@8>DDD??DDDA:A?C;CC6@A=::CCAED:EEDCDDD??BE:EEA XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:72A18 foo_130/1 83 chr2 67583140 60 91M = 67582783 -448 TTACTGGTAAAGCTGTGAGAAAACTTTCCAGACCTTTCTCTGTATTTGTGTGTTCTGTCAAATTGCAGCCACATTCAGCCTTCCCTTTCTC #####??-DC?5;??;88>?->250>*,96/7839/>5AB:CA?>:A?AA;8;??=A?5CFFFF?GGGDGGGFGGGGGGDGGFGG XT:A:U NM:i:2 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:0G31A58 foo_130/2 163 chr2 67582783 60 91M = 67583140 448 AATATTTGGTGGATGGCAGAAAGTGTCCCCTGTAACTAGATGTAGATTTTACTTCAAAAGAATCCCACTGAAGCAACCCTTCTGGGGTTAG GFGGGFGGGGGGGGGFDGGGEGGEGFFGGGFFEFFGGGGEGGDGGFGFGGDGGGGGGDGGGGDGGG?GFGEGDFFF?EFBFGDEEDFD5A= XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_132/1 99 chr4 100066554 60 91M = 100066938 475 TAAAAGTACAAAATTAGCCGGGCGTGGTGGCACATGCCTGTAATCCCAGCTACTCGGAAGGCTGAGGCAGGAGAATCGATTGAACCCAGGA 
GGGGGGGGGGGGGGFGGGGGGGGGFGGEGGDFEFFGGGGGEGGGFGGFEEGEGCGGGAFEFACECEFEEGC=C5@39:86?;5?<>?:5?# XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_132/2 147 chr4 100066938 60 91M = 100066554 -475 AACAAGACTCACATCTTGTATAACTTTGTATCCTAAAGAACACTTGGTACAACACCCTGGATATATCAGGAGCTATTAATATTTACTAACT C=DEEDFFEEEGDCGGGFFGEFDGGGGGGGGGGGGGGGGDEFGGGGEEEEDFF?FFGGGFGDGDGGGGGGGGGGGEGGFGGGFGGEGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_134/1 83 chr11 80126912 36 91M = 80126556 -447 ATGAGGGTAGATTTCTCATGAAGAGTTCAGCACCATGCCCTTGGTGCTGTCCTCGTGATAGTGAGGAAGTTCTTCTGAGATCTGGCTGTGT EFDFFFFDFCFBFEFFFBBFFC5CB:CEEBE@A=B=E?EEECCC5EFDEFBFDFFEEEE?EAFFFDFDFFDFEFFFDFFBFDFFFFFFFFF XT:A:R NM:i:0 SM:i:0 AM:i:0 X0:i:2 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 XA:Z:chr11,+37231477,91M,0; foo_134/2 163 chr11 80126556 37 91M = 80126912 447 AACACAGATAGAAAAGGAGACAGTACCTATTAACTTGTCTTGGGGTTGTTAGTCTGAGTATTGTAAGAGATAAGGGCAGGTAAACGATGAG GAGDDGGGFGEEBAGDDEE=EDEEEEEFBFE5DEE=DBDDEEE=EEEEDA=C:CC=5B:9CDA:CEE5C5??CBDC?CAC########### XT:A:U NM:i:1 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:52A38 foo_136/1 99 chr5 94647738 60 91M = 94648127 480 TACTATTATTTATCTGAAATTTAAATTTAACTGGGTGTCATGTTTTTGTTTGCTAAATCTGGCAGCCCCATATGGAGAGGTTTTTGGTTTT GGGDFGGFGGGFGFGGDGEGGGGGGGFGFGGGGGDFGGGGGGGGGGFGEGGGGGGEGGGGGGGEGEEEECEECDE?D@BDACEEE5C8?@D XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_136/2 147 chr5 94648127 60 91M = 94647738 -480 TGAAATTGGGAGACATTAGAGGTCTTTTTTAATCGTGAGGGTAATGTGATGTGATTTACTAATACTAATGCCAGGTATTAATAAATAGCCT BE?FD=EFAFBGFFGFEFFFGFCGGGGCFCGGGGGGFGGFGGGGFGGGGGGFFGBGFBBFFDFFEFGGFEGDFEGGGGFGGGGGGGEFDDG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_138/1 99 chr3 173697337 60 91M = 173697711 465 CGGGAGTGGGACCCACTGAGCAAGACCACTGGGCTCCCTGGCTTCAGCCCCCTTTCCAGGGGAGTGACCAGTTCTGTCTCACTGGGGTTCC 7<772727:8BB=A>@@B3@<>B@@BBB=BAA5BBA=>BA3A<@9;.45@BAA>>BB25@############################### XT:A:U NM:i:2 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:0T66A23 foo_138/2 147 chr3 173697711 60 91M = 173697337 -465 GCACCCACTGCCTAACCAGTCCCAGTGAGATAAACTGGGTACCTCAATTTATGCAGAAATAACCTGCCTTCTGCATTGGTCTTGCTGGTAG ################CA>>A:5?A:>=;B4:@;6@::?CCC>AC-A:?=?>;5;*9=>BB5=CAAACA:5C?FFFA XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:1T89 foo_140/1 83 chr12 17517928 60 91M = 17517567 -452 GACCCTAAGTCTTTCCCCCATTCTTCCTTTCACTCTCTCAAAAAGGCCCTAGAGAGAGCTCCCACACTAGCACTCCCCGACTCATCCCATC #########################################AA:A->:5@AA>AA949A;,=);<@>>?>AD?:DDDDDD XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:14T76 foo_140/2 163 chr12 17517567 60 91M = 17517928 452 CATCTGCTTCTCACCTTATTCAATGTTTTGATGACCTTCTACTTTATAGCCCCTCCTATTAATCCTCCCTACAGGACACACTCGTGCGCCT DEBEEBFFDFEA:EDDABDDDE?EDCAEEAD5DDDFEE:E6=@@@=>@??C;;=-34<;)5=???AA6A+5*/(7.BB:;==B######## XT:A:U NM:i:3 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:3 XO:i:0 XG:i:0 MD:Z:59G9A17T3 foo_142/1 99 chr9 70554492 60 91M = 70554875 474 TGCCCTGAGGATGTTCAGCGGCATTCCTATCCACTAGATGCCAGTAGTTCCTTCCCCCTAGTTACAACAACCACAAATGTCTCCAAACATT BDDDBDDADDACCCAD=DDDBBDDDDDD:DDBDDDA?-CAACC=CDD?DDDDDD=CDDD@D?B?D?CDBDBDBD=?@@B>??ACAB-?AAC XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_142/2 147 chr9 70554875 60 91M = 70554492 -474 CCATGCTCCTTTGTCACACTGTGTTCCATAGTTTCTGTTATAGAACTAATGGCCATTGAAAACATTTTCCAAACAGTAACTTCTGTTGTAA C5=C?C:*2'>;B,46@>,@@:-DDD@A=DDDC5DDC?-AAAB:A5A:DFB:?CCCCDD;DDD?DDB=@;;EAEE=DEE=?FF5EFBGFGD XT:A:U NM:i:2 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:2 
XO:i:0 XG:i:0 MD:Z:7A14T68 foo_144/1 99 chr7 147690704 60 91M = 147691082 469 TAACAGGGGGACTTACTACTCGTGTGCCAAATTCAATGCTGCTAATAGGAAGCTTGAAAGCAAACAGTCTACGCTTTGGCAAGGAGGATCC GGGGGDGGGGGGGGGGGGGGGGFGGGGGEGGGEGGGGGGGGGGGFFGGGGGDFEGGGDGGGGFGGGEFGGGEFF?FFDBBEDEFCBECBCE XT:A:U NM:i:1 SM:i:37 AM:i:25 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:71T19 foo_144/2 147 chr7 147691082 60 91M = 147690704 -469 CGGCACCCCCCCCCTCCTGGGTTCAAGCGATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGATCACAGGCATGTGCCACCACACCTGGCTA ############??6@>>CC.B<>?7BC5BDADFFGFDFFDGDFGGBGGGGFGGDGGDGGGFFFEFEGFGBFEEFFECEEDFFFFFGGGGG XT:A:U NM:i:4 SM:i:25 AM:i:25 X0:i:1 X1:i:0 XM:i:4 XO:i:0 XG:i:0 MD:Z:1T3A2T2A79 foo_146/1 83 chr17 12277024 60 91M = 12276657 -458 TAGATTTTCACGTGCAATTTTATAACTTTTTCATGCAAACTGTATGTTGGATTTGGCCAAGGGACTTCCAGTTTGTGATTTCTGGTTTCGA GEFF=DGEGEGGEGGEDBEEFFDECEEEEDEGEFEGGEEGGGGGGGGGEGGGFGGGFGFGGGGGGFEGGGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:43G47 foo_146/2 163 chr17 12276657 60 91M = 12277024 458 TTCCTTGCCAGTCTCCAGGAATACAATATAAAAGGGAATGAGAAAAAGGATAAGCAAAGGGCCGTATCAATTCTGTGCAACAGTGGAAAGT GGGGGGGGGGGDGGGGFGFEGGGGGGGGBDGGGFG@7BBCCCCCCFEGGGFG?GGGDEGBGGGG?GBGGGGGGGGD?GFBFEDCFGEFFEC XT:A:U NM:i:2 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:0C59A30 foo_148/1 99 chr7 104293902 35 91M = 104294284 473 ATAATAATATTAACCTTAAATGTAAATGGGCTAAATGCTCCAATTAAAAGACACAGACTGACAAATTGGATAAAGAGTCAAGACCCATCAG GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGDGGGGGGGGFGGGGGGGFGBGGGEGGGFFGGGFGGGEGGGGDGGDGGGFFFGGGGFBG XT:A:U NM:i:0 SM:i:12 AM:i:12 X0:i:1 X1:i:12 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_148/2 147 chr7 104294284 35 91M = 104293902 -473 ACTTAGACTCCCACACAATAATAATGGGAAACTTTACCACCCCACTGTCAACATTAGACAGATCAATGAGACAGAAAGTTAACAAGGATAT =FEEFFBFEFFGDGDFFFFFDGGGGGGGGFGFFFFCEEAEEGGGGFGGGGGGFDGGGGGGGGGGGGDGGGGDGGGGFGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:23 AM:i:12 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 XA:Z:chr7,+57380374,91M,1; foo_150/2 129 chr12 29682556 37 91M * 0 0 TGAGGGTGATCGGCTGGACTCTAGGGAAAGACTACCTTCCCACTCCATCTCCTTTCTGGATCCCCCTCCAACTCACTGAGAGCTACTTTTA FDAEDDEDEE?5CACDDA=?;3*9A@4@@;6;>>==?948?;5 XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_155/2 147 chr6 49608615 60 91M = 49608233 -473 CTTATGTTTACTGGAATTTATGGGTCTTCGATTTTTAGCAAGATTATTGTCATCATTTTTAATACTCTTTTGCAATGATACAAATTATTTC FE=EBFGBEDGGGDGAFFGGGGGGBFFGFGDG?GGGGGGGGGGFGGFFFFEGGGEGFGDFGGGCGGGFFGGGGGGGGGGGGGGGGFGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_157/1 83 chr13 40631296 60 91M = 40630926 -461 TCTGTTTAGGAACAAAAGGAAGGCCATTTTTGCATGAATCTGTTCCCAAGCTTTATGTTTACCTTTGGCATTGTGATTTGGGGGACTTAAG BGFEGGFEGGFGFGDGEDGFGGGDFEGEGGGGGGGGFGGGGGEEGGGGGGFFGGGGGEGGDFGGGGGGGEFFGGGGGGGGGGGGGGFGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_157/2 163 chr13 40630926 60 91M = 40631296 461 TACTCCCACTTGGTAAGAAAGTCAAGCTTTCAAGGACATAAAACAAAATGAGAAGGAAACCTCATCCAATTTTATTTCAGGGGCCCACAGC GGGGGGGGGFGGGGGGGGGGGGGDFGGGGGGGGGGGGGGFGGGGGGFFGGDGGEGFGGGGGDGGGGFEDGGGGGGC@CFFCFDFFGGGFFF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_159/1 81 chr10 13256457 37 91M * 0 0 TACTCATTCTGAAGGAGGTCTTGTCCTCCCTTTGATAATCTGCGCCAGTGCTCTGTATCTTGATGAATATTCAGTAAACATCATTGCATAG FBGEGEDFGFGFEFFEGCEDEFE?BFEE?GFGDFGGGGGGBGGGGGDGGFGGGGGFGGGGGGFGFGGFFGFGGGGGGGFGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_160/2 129 chr12 117996522 37 91M * 0 0 GGACAGAGCAACTGAGACTCTGAGAGGTTAAACGGGGCAGAAGCACCATCTTTGTGAGCCAACAAGGGTCACGGTGTTTAGTATAAAAGAT 
GGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGFGGGFGGFGGGFGGGGGGGGGGGFGGGGGGGGGGFEGDEEECFEFBFGEGGGGFGBFG XT:A:U NM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_161/1 83 chr4 114789460 60 91M = 114789093 -458 GGCATTTGCGACTTCTAGTCTTCCTTTTTTGTCCATGTTCCCCCATAATGGAAATGTTCCAATTCTTAAAAAAATAATGTTTTCAAATCAG BDGGGGGGGFFGGGGFEGGGBFGGGEFGGGGFGGGGGEDGGGGGGGGGGFGGGGFGGGGGGGGBGGGGGGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:63C27 foo_161/2 163 chr4 114789093 60 91M = 114789460 458 GCAAGTCTGATGGGATATACACAAATGAAAGTAGTTGAGCCAACAAAAGATACATGTTTGGGAAAGGCACAAGGGACCGGTAAGAAGTGAG GGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGFGGGDGDDFGFGGFGFEGGGGGEEGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_163/1 99 chr5 50568265 60 91M = 50568652 478 ATTATTTAGTTGTTGTATCCCCCAACAACGTTTTTAATATAAGGAAGTTATGTCTTCTAATGATTATTTTGATGCTACCCTAATGCAGGAG GGGGGGGGGGGGGGGGGGFGGGGGGGGGGGEGGGGGDGEGGFGFEDGDGEGGEFGGBE?DF?ADEBEEFC05=<=C?5BB@?B@@BBCE?A XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_163/2 147 chr5 50568652 60 91M = 50568265 -478 AATTTCAGAATTCCTCCATTCTCTTCAGGATCAGAACCTTGCCCTTCACTAGGGTCTTTGAACCTTACCAGGTCCTACAATACTCCCCCAT F?CCB:BFFFFBFFAFGGEEGEGFGGGGGFGGGGGGGGGFEGGGGFGGGEGGFGEGGGGGFGGGGGFGGFGGDGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_165/1 99 chr12 72989448 60 91M = 72989815 458 CCAAGGTCTTGCTGGGCAAATCCTACACTTGGCAGCAACTCTCTCCTGACCTATAGGGCTTAGAGCTGTCATTGTATTCTGTTACCTCCAT GGGGGGEGGGGGGGGGGGGFGGGGGGGGGGGGGGGFGEGFGGFDGGGAEGGGBGBEFGGFFDGCFBEFCEDDEFCGGGGGGECBDGGFGEG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_165/2 147 chr12 72989815 60 91M = 72989448 -458 ATCACATAATAAATGATTTGCTTTATCCTAGAGTTTATGGTTCTATCATGCTTTTTTTTGTAAGTATCAATACCAGTGTTAGGAGCTAAGG EFGDGFGGGGGGGGGGGFGGFGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFEGGGGGGEGGGGGGGGGGEGGGGGGFFFGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_167/1 83 chr6 11798197 60 91M = 11797819 -469 ACATCACACAGCTCCTGTGCTTTGTTGATATTGAGTTCCAGAACTCAAAAACAGCCTTCAGTAGCCGGCATAGGCTCATTAGTCATCATTT 7>=9>;?;A?5:?B1>BB@,DFADFEB?CBE@=@?C:A?:AC>A?3@CCC?ECCE?FEEEEGGGGGGGFGGFGGGFGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_167/2 163 chr6 11797819 60 91M = 11798197 469 ACAAGCTCATGTCCTTTGCAGGGACATGGAAGGAGCTGGAAGCCATTATCCTCAGCAAACTAACTCAGGAACAGAAGATCAAATACCACAT GGGFGGGGGGGFGEGGGGGGGGGGGGGFGGGGGFGGGGGFGGEGEGGGGGGGGGFDGGDGFGGGEGGGGEFGDFGGB?BGEEE@EDECECB XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_169/2 129 chr2 234871404 37 91M * 0 0 AGCTCACTTTTGGAGGAATTGAGATTAAGGGGTCAAAAGATTTCTTTTGCCTTAAAAAATATATATACTGCACTCCTAAACTGTTTGAAGT GGGGGGGGGGGGGGGGGGGGGGGGEGGFGFBF=DFFFFFF?FFFFGGGGGGGGGGGGEGFFGGGGFGGGBGGGGGGFG5GFDGEGGGGDGB XT:A:U NM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_170/1 83 chr11 80258983 60 91M = 80258618 -456 CTCTGCCTCCCAAAATGCTGGGATTACAGTGTAAGCCACCACACCCAGCCTGGGATTTATTAACTTTCAAATTAATCCTCATTATTACACA ###################################B>::@ABB:C?BB?BBBACDBEE=EDEBEEEACCCCAAC?@C XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_170/2 163 chr11 80258618 60 91M = 80258983 456 TACAAGGGATGCTGCTCACCTTCTTACTTTAAAAATAAAAATCCTCACTGCAACTTTTAGTTCTAAGTCCCACGGCCCAGTCCTCACTCTA @@/4@/*4@@>>>.:6A.9346633D:DDCC=DA470,,/&.A:A4,-<5>>>>CEEB=EFFFF::.7<4803;:?C=5C>??:41<35;:A>>5?>>:? 
XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_176/2 147 chr11 73438307 60 91M = 73437939 -459 AGAGAAGCTGATTCAGGGGATGGAGGAATTTAGTGATCCCAAAGCCATCAGAGAAGACAGATAATGAAGTTGAGAGGGACTGGCAACCAAA EGEEFEDEBEDGFFFGFGGDFFGGFFGGGGGGGGGFGFGGGFFGFGGGEGGGEBGGGDGGGGGFFGGGGFGGGEGGGGGGGGDGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_178/1 99 chr12 53797761 60 91M = 53798144 474 CTTTTATCACAGTAAAGATCATTTTTATACTATTCCAAATGTGGTTCTCTATTTAAAAAATAAATTTTATTCATTTATTACCATTATTTCC ;;CCB7+67@9=99;262*7CACACC;>@C=C6A?BCC6=AC>A8@@?1==>>=>4-AACC>A0@C?CCA)=ACB;+;;A5+>>=CC@;CC XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:76C14 foo_178/2 147 chr12 53798144 60 91M = 53797761 -474 ATGTGAACTCCATGCATTTTCAAACTTTTTTTTTCATTTGATGTAAATTTGAGTTCTGATAGAGAGTATCATGTAGACCAAGTTTTGGGTA D=?B=5A-BA?>B:?4:>:?5AAA8=B>5?776<6::BB8-;59< XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_180/1 99 chr13 73842639 60 91M = 73843030 482 TCTCGATTTATATCTGTAATTGTATCTCTATATTGTATCTCTATATATCTCCTATTGGTTCTATCTTTCGGGAGAACCCTAATACATAGAC GFDGGGGGGGGEEGGGDEGFDBDEBEDEEADFFFFFFGEGFGAGGGFEDDGGGGEGDDEEEFGEEEGGEBBBBBABCBCCEBGGFFEBB=E XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_180/2 147 chr13 73843030 60 91M = 73842639 -482 AATAATGCAGAGATAAGTGAAATCTCCCTTACCCAAATTTTTGTTTGATTTAGGCCTCCAGTGGATTGGCTGAAGCCTGCCATCCAATTTT FBCB?AB:CCBACBEBACADB?8AACAAAA-EE:EEAECEFE?EEEDEDECADDD?EEBEEFFBFFEEDBEGFD?GGFGGGEEEEAEEEEE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_182/1 83 chr5 60820050 60 91M = 60819675 -466 GGCTGTAAATCCATCTGATCCTGGGCTTTTCTTTGCTGGGAGATTCTTTATTACCGATTCAGTGTTGCCACTCATTACTGGTCTGCTCAGG FFGEGEGFEEGFA:FFDCCFEEFEEEE=DEGEEAFFEGGEEEEFEGGGFEFFEGGGGGGGGGGFGGGGGEGFGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_182/2 163 chr5 60819675 60 91M = 60820050 466 CTGAATTTTACAAATGGTTTTTCTGTCTATTGAGATAATCATATGGTTTTTGTTCTTTATTCTGGTATCACATTTCTCAATTTGCATGTTA GGGGGGGGGGGFGGGGGEGGGGGGGGGGFGGGDGGFGGGGGGGGGGGGGGFGGGGGGGEGEGGGGGGGGGGGGGGGFGGGGGGGGGEGFFG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_184/1 99 chr17 1496449 60 91M = 1496841 483 TTGAGGCCACACATCCTTCCACAGCCAGACCCCTAAGTCAGGCCTCTGGGGCGGCTGAGGCCCCCAGACAAAGGAGTTCGTGGAGGGCAGA DEE?EDDFFFFFFAFFFAFEBDE:?EDEEABEEEEFFFFFEFFFBFFF=FAAC:1<><::*=7B79;8>=3?4=)A############### XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:74T16 foo_184/2 147 chr17 1496841 60 91M = 1496449 -483 AGCACAGCAGGGTGATGGGCAGAGAAGCGGGAAAGGGGATCCTGAGAAGGACCAGCCAGGGCCCTTCCTCCCTCCCCACACTCTTGCCTTT ###A>=?BA=D=?ABBDB:?D=D:D:A=AEEED=CCD?ACE>@C=@:CC=?DCCB?C?D?DACBA;>;:::B?ADEACECE4BC@=>AA?A XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:15C75 foo_186/1 99 chr15 42674703 60 91M = 42675069 457 AAGTAATCTCTTAAGCTCTGGAAAGAAGTGAAGCAACTATCACTACCTTAAAGAAAAGCTGGAAACCTCTGATGAGAGTTTTGCTCTTTTG GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGDFGGGGGGDGGFGGGGFGGGGGGGGGGGGGGGGGGFGGGGGGGGGEGGGGGGGFGGGF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_186/2 147 chr15 42675069 60 91M = 42674703 -457 GAAAAGCCTTCAAACATGAAACAAACTAATGAGCAACCAAGGGACTGTTCTAAACTGTCCTGCAGAGTGTTGCAGGTCACAAGACCAAGAA EGGEDAFFFEBGGFGDGEGEEGGGEGGGGFFGGGGEGEGGGGGGGGGGGGGGGEGGGGGGGFGGGGGGGGGGGGGGGGGFGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_188/1 99 chr4 171770955 60 91M = 171771342 478 
TACTAATTTCTGAATTTTGGCTACTGTTAGAGTTATTTTATGTTATATATATAATCTGATTCTGTTGATTTTATCTAGTCTCATTACTTTA GGGGGGGGGGGGGGGGGGGGGGGGEDEE?BBD@C@FFFE?EEFFDFFGEGFGDFFEF?EEGEFF?C?CCDEEAECD5C@CE?CCEB?EEE@ XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_188/2 147 chr4 171771342 60 91M = 171770955 -478 TGGATTGCAAACCTGCTGCTAACCCCTCTCTCCCAAACTAATGTTTGTATGTTTATATTGATGTCGGTATATTGATGTTAATTCCTCCCAC FBBBCB=@C@:FFFFGGGGFGDFEFFFGEGEGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGFG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_190/1 83 chr7 47698451 60 91M = 47698082 -460 TTCAAAGAACTAATGAATTAAAATAACTTTCTGTTTCTCATCTTAACTGTTCTTTCCCTGACTGTAGAATTATTATTTGGAAAGTGGAAAG G:AEECBEB=DDFFEFBE>>BCBCBA=BEEBFDDFFEABAEDFFDFCBDB=FFFBDFGGGGFFFFEEFFDEGBDGGGEGFGGGFFGFGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_190/2 163 chr7 47698082 60 91M = 47698451 460 TTTTCCAAGAGCAACACATAATTTTTAGTCATGGTTTGCTCATATTAAACATATGGTTACTGTATTACTAAAAAATATCAAAGTAATTTAT GGGFEEGFFEEEDEEEEEDEDEEDEEC?BEEFEEEGGDGFCCDAEDBDDD=ECEDDEDB=:5:A5==.:4ACAAA################ XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_192/1 99 chr2 147405447 60 91M = 147405814 458 GACAGAGCAAAACCCCGTCCCAAAAAATAAAAAAAAAGATTCACCAGGTGAATTAAAGCCTGAATAAGAATATATAAATATAGTTATAAAA GGGGGGGGFGFGGGGGGGGGGFGFGGGFGGGGGGGGGFEEGFFGGFDFCFBGGGEGG=BGFFBGEGGGGGEGGGEGGGFGFFGEGGFDGDG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_192/2 147 chr2 147405814 60 91M = 147405447 -458 GAAATATACGACTGGCAGAAGGGGTGAAAAATCAAACTATAATTCAACAATTGAAGACCATGTATATTAATATTAAATATTACAAAAGCTT FGGGGGGDGGFGGGGGGGGGGGGDEGFFFFAAGFGGGGGGGFGFEGGGGGGGGFFGGGFGGGGGGGGGGGGGGFGGGGGDGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_194/1 99 chr8 68921390 60 91M = 68921766 467 AGGGAAGAGGCAGAAGCAAGATCAAGCCAAGTCAGGAGTGGAGCTGCAGTGCTGGCCCCAGTGCCTTAGCTCTTGACATAAAATAACACGG GGGGGGGFGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGEGGFGGGGGGFEGGGGDEGGGGDDGEGBGEGEEGBGBBGDGGGGGGEECEFFG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_194/2 147 chr8 68921766 60 91M = 68921390 -467 GTCCTGCAGGAACTGCCTCTAGAATTTTTGGTTACAAGCCACATTCTTAGCACCCTTTTGCTCTTCTGTGCTTACCCCACAAAGGTCACTG ###################################BGGGGGGGGGGFFFFFEEGGGFGGGGFEGGGGGGGGFFFGGGGGGGGGGGGGGGGG XT:A:U NM:i:3 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:3 XO:i:0 XG:i:0 MD:Z:2A12A8A66 foo_196/1 65 chr1 94930504 37 91M * 0 0 TATAATAGTGATTCGTCTTAACCAGGGAGACAGTGTCATGTAAGAGTTCCTGTGTTCAAATCCTTGTTCTGCAACTTACTTGCCATGGAAT GGGGGFGGGGGGGGGGGGGGGGGGGFGAGFFFFEFFGGFGGGGGGGCGGGFFDFDGGGGEDGGGGGEFEGEEGEEGGFGGGGDGDFBFDFF XT:A:U NM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_197/1 83 chr8 141740825 60 91M = 141740453 -463 ATCAACACAGATTTGGGAGGCATCAAATACTTGAATCTATGAGAATGAGAAGAACTACCAGAGTGAAAAAGTATTTTTTAATAAAACAAGT FDGGDFEFGGFFGGGGGBGGGGFGGEGGFGBGGGFGGEDFGGGGGEGGFGGGGFGGEGGGGFGGGGGGGGGGGGFGFGGGGGGGGGGEGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_197/2 163 chr8 141740453 60 91M = 141740825 463 AGGACTTGGCAAGAGGGATAAGAGATTTTTAAGAAGGAGAATGACAAAATCTAGAGAGATATTCTAGAAAGATGGCTCTTGGCGGCAATGT GGGGGGGGGGGGGFGGGGGGGGGGFGGGGGGGGFGGGGGGGGGGGGGDGFGGGFGGGGDGGFGGGGFGGGGGGGGGEGFGGGGGB@=A@@1 XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_199/1 83 chr15 42000147 60 91M = 41999776 -462 GAGTTTGATTATCCAGAGGTAGCTTAACATTGCCAGATGTAAGCTCCATGAAGGCCCAGAACTGTCAGTCTCGTTTACCACTGTATCACCA 
GGDEFGBFGFGADEEDEEGBEFDGFGEGFGFEGFEGGGGGGFGGEGGGGGFGGGGGGFGGGGGGFGGGGFFGGGGGGGGGGGGGGGFEFFF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_199/2 163 chr15 41999776 60 91M = 42000147 462 GAAGTGGAAATTTCCTAAGTCAGGGATGGCTGGTATATATTCCATCTTTCTTTGAGTAACATAAATTGCCAACCCTTGTGTTAGAAAACAC GGGGEGGGGGGGGGGGGGGGGGGGGFGGGFDEE?CBABBCGAGFGEFFFADDC>BFDEFEGGFFEEFEDFDEEGGGFFEECFBEBEFECAB XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_201/1 99 chr2 55571418 60 91M = 55571787 460 TGGCAGAATTCCAAGGGAAGGTTCATGAGCAGCTGTTGCCAGGGTCCCCAGAGCTTCCTTAGCTTCCGCTCTTCATGGTTGGTTGTTCAGT FEADFFFFFFFFFFEEFFFFF:FFFDE?EAFFFFDFDFFDE5EEAAACCC:DFEEBAAFEB@B?C-A>9@CDEB=E:C: XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_203/2 129 chr14 71031940 37 91M * 0 0 TGATGCTCCCTCATTTATTCCTGATTTTGGTAATTTTGGGGGGGATAATTTTCTTTTTTTCTTGGTCAATCTAAATAAAGCTTTGTCTATT GGGGGGGGGGGGFGGGFGGGGGGFGFFFFFEGGGGGGGGFGGGGDGGGGGGGGGGGGGGGGGGEFGGGGGGGGGGGGFEGBGGDGGFGEFF XT:A:U NM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_204/1 83 chr3 43677026 60 91M = 43676644 -473 GAAGACAAGATTAGAGAAAAGAGAGTGAAAAGAAACGAACAAAGCTTCCAAGACATATGGGACTATGTGAAAAGACCAAATCTATGTTTGA ######?A:AA*>996?AA:A@5???08E XT:A:U NM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_208/2 129 chr6 125609829 37 91M * 0 0 ATTCCATTCTGGTCACAAAAAGAATACCACATTAATAAAGTTATTTCTGTCCTAGGGAAGACCAAGGCACATATTTGAAAATATTCACTAG 5-5<;,76<;6,:6,66762211.(;;<-<:=?.>5<;9:09,7:227:6>==:5-=9<;DD:ADEEB5?=5=?=EEEBEDFEFAD5A=DA XT:A:U NM:i:1 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:11C79 foo_209/1 99 chrM 4732 46 91M = 5123 482 AATACTACCAATCAATACTCATCATTAATAATCATAATGGCTATAGCAATAAAACTAGGAATAGCCCCCTTTCACTTCTGAGTCCCAGAGG GGGGGGGGGGGGGGFGGGFGGAGGGFGGGDGFGGGGFGGGGBGFFDEEFEFF?ACEFAFDEDADFBEEFBDFECECEBB@CB=5BEBCCAE XT:A:U NM:i:0 SM:i:23 AM:i:23 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 XA:Z:chrM,+555144,91M,1; foo_209/2 147 chrM 5123 46 91M = 4732 -482 TACTCAACTTAAACTCCAGCACCACGACCCTACTACTATCTCGCACCTGAAACAAGCTAACATGACTAACACCCTTAATTCCATCCACCCT @5><:6ACBADA5A:CC=CC:CD@DD?CCACBEA:EEE5EEEBEBEFFFFFGFGFGGGGGEEEEEDEEEE5EEEEEGGG?FEEEEE?DDDD XT:A:U NM:i:0 SM:i:23 AM:i:23 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 XA:Z:chrM,-555535,91M,1; foo_211/1 83 chr5 74871224 60 91M = 74870845 -470 AGACAGGCGTCTTACTCTCTTGCCCAGGCTGGTCTCAAACCCCTGGGCTCAAGCAATCCTCCTGCCTTGGCTCCCCAAAGGGCTGGGATTA CE?EDF?CEBEC@?B=DBEDCBFED@EEEEEDADDEGEEGGGFEGGEGEFGEEEEECEE@EGGDGGGGGGGDGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_211/2 163 chr5 74870845 60 91M = 74871224 470 TTTATCATGTTTTGGCAAGTCCCGAAGCAGTATTTTTGGGATGTTTAATTTTTTTGTTTCTGGTATTTAGGCATGTATGATATGACTAGGA GGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGFGGDGGGGGGGGGGGGGGGGGGGGGFGGGGGFFGGGGGFCGFEGGGFDGFGGG: XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_213/1 99 chr9 20628481 60 91M = 20628845 455 AATTAGAGACCGCCCTGGGAAACACAGAGAAACCCCGTCTCTACAAGGAATTTTTTTTCTTTTAATTAGCTGGGCATGGTGGTGTGCACGT GGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGFGFG:GGGEGEGGGGEFFGGGGEEE=GGDGGDGGCFDFFFFF? 
XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_213/2 147 chr9 20628845 60 91M = 20628481 -455 ATCCAGTAAGGATCAGGGAGCATGCATTTTCCAAACTGTCTCCAACAATAGCAAATAATGGTCAATGGCTGGATATTCTAAACCAATGTAA 5:A=?@?C:CC:DDBGEGGGGGEGGDGGGBGGGGDGGGGGGGGGGGGGGGGGGGGGGFFGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_215/1 83 chr5 9438690 60 91M = 9438325 -456 GGACAAAGGAGTCAGGTGAGTGAGCTGACCTAGTGAGGTGGCCTCAGAGATGTCTCAGGAAGTCCTGTCCCATGCATGCCACAGCAGGACA CE?AEEEDEBCBBFFFBACB=BBBCB@?D@=AEBC?EDDGEGGGCGFGGGGGGGGDGGGGFDGGEGGFGGGGGGGGGEGGGGGGGGGFGFG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_215/2 163 chr5 9438325 60 91M = 9438690 456 GTGTTTGTGCAATAAATAACATTTCTCATTCACTCCCACGCTACAACTGAAACTGAATAGGCTTTGGGGACAGAAAATAACTGCTCCTTTT FGFGGGEGGGFAEGDGGGFDGGDGGFGGGGGEGGGEF=FDDEDEEFEFFFEED:EEDEE5EEEEBF?F=EGGFDAFFEF>:DDDBEC=GDF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_217/2 145 chr8 28899072 37 91M * 0 0 AGATCACTGCCTTTTTAACTTATCTTTAAAATGTAGGATAAAACCACTGCATATTCTATTTAATTATAGCAATTTCTTTACAAAGGTAAAG EGGDGFGGGGGG=FGGGGGGGGGGBGGGGGGGGGGGGGGGGGGGFGGFGEGGGGGGGGGGGGGGFGGGGGGGGGGGDGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_218/1 99 chr1 206890991 60 91M = 206891385 485 CCTTAGGAAAATAGAGAGGGCTTTCTCTTATAAATATATTTTTTAAACAAGTAACATCCAGAAAAAGCAAACCCATCCTTCGCTTGCTATG GGGGGGGGGGGGFGEFFFGFGGFEGDDBEDGFG:DGGFGGGFGFEC:EEEGEEBGEF@F:CABAA3?>C5A@B@@AEEEEGGFEFFBGEFG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_218/2 147 chr1 206891385 60 91M = 206890991 -485 AATTCTACTGTCAGATGACCCTATTCATTAATTCATTTTCTCCCTGAAGACTATTGAATATTTAAGCCAGGAACCTGAGATGTACCCAGCG D?DB:D?FFF?FDEFFC?C55BDGGGGGEGGE?AEDGGAGGEEEEEEFDEAFFFFFF=EBGGGGGGGGGGGDGDGGFFFFAEEDEEFEFGF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_220/1 99 chr15 95365966 60 15M2I74M = 95366327 452 TCTAGAAATCTCTCCCTCTGTCTCTCTCCTTCACCGAACATGCACAGAGGGAAGGCCATGTGAGGACCTAGGGAGAAGGTGCTGTCTGCAA GGFGGEEGGDGEGGFGGFFFFFGGFDGDDGGGFFFGGGDGGGGGDAEFDFDGFFDEFDFFCEEFFEAGFGGDGED?ECECEEEEEECE@:B XT:A:U NM:i:2 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:1 XG:i:2 MD:Z:89 foo_220/2 147 chr15 95366327 60 91M = 95365966 -452 AAGGGATTTATGAATAGATATTTCAAAGATCTCTAAAGAGAAGAGAAGAGAAGTCCTTTCTTCTGTTCAAGCAGATGATTGTTGTAACCTT EEEC?CFFDFFEEEEDFAFDBF?=EFF=FDFDDEEDEFDFFEBFFFFFDFFDD:DC>CCCCFD?FF=FDGGEGFFGGFBFGAEEEE:DDDD XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_222/1 99 chr5 2818909 60 91M = 2819293 475 AGTGTTTTTATTAATATATCTTCTCATAGATTAGAAGCCTTGTTTTTTTCAACATTGGCATGTTTATTTACTGAATTTATCTTAAAATAGA BA5ABDEE@BFDFFFFFFEEFFFFFFFFFBFFFFDFDFFDE??EABABB5CBCC8/BCB5BDDDCDFEF5AC@=@C? 
XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_222/2 147 chr5 2819293 60 91M = 2818909 -475 TGTAATCCCAACACTTTGGGAGGCTGAGGCAGGCGGATCACCTAAAGTCGGGAGTTCGAGACCAGCCTGACCAACATGGAGAAATGCCATC A:>>??C@@CA?=?C=CEEB?EDDAE?DBD?EEBEECB=B?FFFFFFFDFFDG?GGEE?EBEEEEEEGGFGGGGFGFFF=GGGDDGDGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_224/1 99 chr3 142788081 60 91M = 142788459 469 ACTACTATTCATGTCATTTAAAGTTAAAGGATACTTCTTTGTTTTGGATTAACTTTTAATTTTTATAGCTAAATGTTTACATCTGTTATGT DFDFFEFBFFFFFFFFEEFFFFFFFFEFFFFFFFFFFFFFFFBFDEDBDBFFFDFFFBDFFFFDECADFFD?FFFCEDFFFEFFFAABDEE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_224/2 147 chr3 142788459 60 91M = 142788081 -469 TTATGTTTTTCTCACATTAAATTTCTTGAGTTTCTGAAAATGGTGTTCCTTCAGTGTGCTCTGTTTTCTAAAAATCCATAGTAATCCATAC GGAGGDGDGBFFFFFFDGEDFGGDGGGEGDBGGEDGFGDGGDGGGGGGGDGGGGGGFFFFAGGGGGFFFEF?GGGGEGGDGGFGGEGGGDE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_226/1 99 chr10 65516550 60 91M = 65516941 482 AGGCCTCAGGAAGGAGACTCAGAGGGTGCAGGAATCAGTGAACCATCCCCTTTCTTCCCACAGTCGCCCTATCACATTATCCGTTTTGGTG GGGGGGGGFGGGGGGGGGGGGGGGGF?FDFFFFFFGGGEGFEGGGGGGEGGFFGGGGFGGGFFEDFDGEFEFEEGFFGGGGFGEGFGGGEG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_226/2 147 chr10 65516941 60 91M = 65516550 -482 CCCACCCTCTTCCTTAGCTGCCACGATTCTCTTCTGCTGTTTCCCATGTAGTCAATAAGATGACTTGAAGATTTACCTTCAGAAGAGCGCT ###########?184-:FAFFGEFGGFDGGGGDGGGGGGGGGGGGGGGGGGGGGEGGGGGGGGGGGGGGGGGGGFGGFDGGGGGGEGGGGG XT:A:U NM:i:2 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:0A3T86 foo_228/1 99 chr4 108147713 60 91M = 108148076 454 TTATAGAAAGTTTACTTTTTTAGTTTAGTGATTTTTTTCCTCCAGTGGGAGTTCTTGAATTTATAATGCATAGGTGATTTGTTTGTGATGA @6?536(127@15-50668@=B=7C642*@2>5)@############################################ XT:A:U NM:i:2 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:51A14A24 foo_228/2 147 chr4 108148076 60 91M = 108147713 -454 TTTTAAAATAAATATATTTTGAATAAAAACACTACAAAATAAGAAATGATTTGAAAAGAAAGCAAATATTTTGGGTGTGATGGTGCTACAG D:C;8;?AC?CACAC?:AEFCA888=AAAC:D;D>>EEAEBA>AA>A:?AC?>>6AA?D5ADD?CA>A7?7873+;99=B=B= XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_230/1 83 chr20 20635948 60 91M = 20635572 -467 AGCAATCTTACTTCAATAAAATCTAAGGAAATATCGAAAAAGCTGCTTCTCAAGATGTTTACTGCAGTATTACTTCTATCAGGAAAAGGGA FGFGGEGGF?GDFGGGEGGGFEGGDGGFGGGGGBFGGGGGGGGGDFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_230/2 163 chr20 20635572 60 91M = 20635948 467 ATGTCATGATGATAATAACCAATAAAACTAAATATATGATAGCATTCTATCATTTCATTAAGGTCCAGATCCCATGTCCCACAACCCCCCA GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGEGGGFGGFGGFGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGFGGAGGGGGF= XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_232/1 99 chr2 91680180 9 91M = 91680212 123 CCCCTGAAAACTAGACAGAAGAATTCTCAGAATCTTATTTGTGATGTGCGCCCTCAACTAACAGTGTTGAAGCTTTCTTTTGATAGAGCAG GGGGGGGGGGGGGGGGGGEGGGGGGGGGGGGGGGGGGGGGGGGGFGFDGDFGEEGGGGGEGGGGEGGGG?FFECEGEGFFGGGGBGEGEEG XT:A:R NM:i:2 SM:i:0 AM:i:0 X0:i:4 X1:i:5 XM:i:2 XO:i:0 XG:i:0 MD:Z:21C10A58 foo_232/2 147 chr2 91680212 9 91M = 91680180 -123 ACTTATTTGTGATGTGCGCCTTCAACTAACAGTGTTGAAGCATTCTTTTGATAGAGCAGTTTTGAAACACTCTTTTTGTGGAATCTGCAAG 4FGBEGGFGGGGBGGFGEGGDFGGGGGGGGGGGGGGGGFGGGGFGGGGGGGFGGGGGGGGGGGGGGFFCEAGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:3 SM:i:0 AM:i:0 X0:i:1 X1:i:2 XM:i:3 XO:i:0 XG:i:0 MD:Z:20C20T20G28 XA:Z:chr2,-91689386,91M,3;chr2,-91680212,91M,3; foo_234/1 99 chr2 142360798 60 91M = 142361182 
475 AGAGTACTGTACTTTTGCTTTCATTCCTTCTATACATTCTGCCTTCATCCTTAAATTGTTCAACTCGATAGTGCTAATATTGGTAGATAAT FFFEFGGFGGGGGGEGGEGFAGGGFGGGFEFFFAFGGGFGGGFGGGDGGGGGGEGGGGGGFDGGBGFDFEECFGDDFFFFE:EA=E=CBEG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_234/2 147 chr2 142361182 60 91M = 142360798 -475 GTGCCACCATGCCTTGCTAATTTTTGTACTTTTAGTAGAAAGGGGGTTTCACAATGTTGGCCAGGCTGGTCTCGAACTCCTGACCTCAAAT FEBEFEF=FFFFGEDGGGFGGGBGFGEEEE?GGGGFGGGFGGGGFGEGEGGGGGBGEEFGFGGGFEGGD=GDFEGFFGGGGGG?GFGGGGG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:28T62 foo_236/1 81 chr20 47287266 37 55M1D36M * 0 0 CTGTCTCTAAAAAATGAAATGAAAAGTACGTATTTATTGTTGGAAATTAGTAAATAAAAAAAATGTAATTCTGTGTCAAAGAGCTCACCAC DDC@B?:??DDDDDDDDBDDDAD=BDA=CDBDB:ADDDBBDDDDDDDDDDCCCC>CCDDDDDD5DDDDDDDDDD=BDDCDD<<>;;3686; XT:A:U NM:i:1 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:1 XG:i:1 MD:Z:55^A36 foo_237/1 99 chr8 120141611 60 91M = 120141982 462 ACAATGACTGTAAGAAAATAAGAGGAGCAACCAAAAAGTGTTTTGGAAGCTAGCAAAAGTATTAGACAAATGGTCACTGTCTTAACAGAAC GGGGGGGGGDGGGGGGGGGGGGEGGGGGGGFGGGGGFDCGFDFFFDGGFGBGGGGGFGGCEEGFEEEFEGCDCA:CCAC>GDGG?GFGDDA XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_237/2 147 chr8 120141982 60 91M = 120141611 -462 GCTACAGAAGACTAAAAGAAAGTAAGAACAGTGAGTTTCTTAAATCCCTTTCTTAATCAGAATGCTACTGGTAATATACACATCCCCTTAA F=<;<9BGFGEGDGFFGFGGGFGGGGGGFGGGGGGGGGGGGGGGGGFFFDFBGGGGGFGGGGGFEGGDFGGFGGGGGGGGGGFGGBGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_239/1 83 chr18 36107132 60 91M = 36106763 -460 GAGAGGCAAAGATACCTGAATGCCCTAAGTGTTTCTTTTTTGGGGGAATATTTATATGAAATTGCCAATGTAGTCTGTCTCTCTTGATGCT A5@@=56;:89:?>AC5ECCBD5AEF;>9@-A5?EEC@5@4@C=5CEE=:EEEE?EB:DDD::DDEDEBDDEBEEDEDFDFFDFFFEA=EE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_239/2 163 chr18 36106763 60 91M = 36107132 460 GATTAAGCTTCTTGTGTTCAGTTTCCCTCTACTCCACTGGCAAGGACTTGAACTGCTCTATTTGCTTTGCACAGTCCCATTAAATTATTCA C==@=?CC:C@=B@BDEDE5DD5?DEADEDGDDDGFFGAGBD?D=AACA>5AA>AD:D5DDDD;??DD?5DD:=BGGGDFA?E5EEECE:C XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_241/1 99 chr13 21602081 60 91M = 21602469 479 GAAGATGGAAACACACACAGCTCTTACTTTGCCCATACCCTGGTGCCCCTGGAGCAGAAGTCCTGATGTGGTGATATCATTTCTATGCCTT FFFFFDFBFFFE?FFFDFEFAFEFEFFFFDDFFDBFFFEFFFF=FEEDEEE?AEBECCCBBBBCEEEEBBC=C=>DCCDCFEF=FFEEDFF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_241/2 147 chr13 21602469 60 91M = 21602081 -479 AAATACATAATCAAATGGGTTATATTCTGAGATTACAGTAAGAAGGGGCAGGAGATTAAAGTGAGATTGACAAAATTAAAAATATTCAGAG =EEE:EGBGEEFBBFDAGGEEDDGGGDDFGGGEGGEGGEGGEEEEEEBEEDFGFGDEEEEEDE=GGFGGGGGFGGFEEFFEEDGEGGGGGE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_243/1 99 chr14 98035327 60 91M = 98035718 482 CTACAAGATAAGCCTATTTCTTAACACACCTATGTCAAGCAGAGGTTGTAATGCAACGTTCAGCTACATTAAAGGGCTTCCATGAGGAAAA GGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGFGGGGGGGGEGGGGEDFFFEA?EEGFGE:FDFFFGFFGGDGEFGGEGEGGEGDEGEDECD XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_243/2 147 chr14 98035718 60 91M = 98035327 -482 TAAGCCCTCACTCTGCTGCAACGCTTTACAGTCTAGCTGCAGTGCTGTTCAAATGAACTTTCAGTGATGATGGAAATGTTCTATATCTGTA EBB4AECEEAE5FFGGGGG?EFFDEFGGFFGGEGFGFGDGGGFGGGFFFEDGGGGGEGGGGGGGGGBGGGEGGGGGGGGAGGGFEGGDGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_245/1 83 chr2 220279769 60 91M = 220279387 -473 GCAGTGAAAGCTGAGCCCACCTGGATCCTGCATTCTAGGGTTGGCTTGCAGTCCACAAAGTGGGGGAGGAAGGGAGCACAGAGAAATAGCT 
CBDEEDCB@5CFFDEF@@:BB?EEDEBD?DDDFBBFE?FEEDEFDBAFEBFFFFEFFFEEFFFFFFFFFFFEEE:E?=FFFFFFDFEDFFF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_245/2 163 chr2 220279387 60 91M = 220279769 473 TGCCAGCCTGGCTAATTTTTAAATTTTTTGTAGAGATGGGGTCTCACTCTGTTTCCCAGGCTGGTCTTGAACTTCTGGGTTCAAGTGATCC GGDGGDEED?FFFEFG=FGGGD?GFFGBGGGAFGGFEDDDGFG?GEGGGGBFFGDFFGF:EEEE=4-<+1B.:B=;5?;(C.CC>73*5?A XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_247/1 83 chr7 126844340 60 91M = 126843963 -468 GAAGAGTATTTTGTACGGCATAATACAAATGTTGGAATGAGAATATAATGCTCCTGATGCTGTTAGTCAAGAGGTTTGCATAATTTGGACA GGGGGFGFGFFDGDFGGGFGGGGGEGGGDGGGGGGGGGGGGFGFEGDGGGGGGDGGGGGGGFGGGGGGGGGGGGGGGGGGGFGGGGGFDGF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_247/2 163 chr7 126843963 60 91M = 126844340 468 GGGACTGTCTACAAAGAGGCACACGAGTACTTTTTGAAGCAATGGAAATATTCTATATCTTGAATGTGATAGTGGTTACACAACTGTATAC GFGEGGGGGGGGGGGGGGGGGGGGGFFAFFEEEEEEGGGGGGFGFBFFFFGGEGGGGGGGGGGGGGEGGFDFDFFFDEDB########### XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_249/1 83 chr20 51388009 60 91M = 51387633 -467 CGGGGACCCCCCATAGTGTGTCCATGAAGTCAAACGTGGTCCTGCAAACTCTTTGCAAGGTGCTGGCCTGTCGCAGGCATTCCCCAGCCAT ###################################################A?64693?9,539=2=);=7@>=4C:72?8D-DDDDDDDD XT:A:U NM:i:4 SM:i:25 AM:i:25 X0:i:1 X1:i:0 XM:i:4 XO:i:0 XG:i:0 MD:Z:1A5A1A72A8 foo_249/2 163 chr20 51387633 60 91M = 51388009 467 AAGATAATTAAATAACATTGACTCCCAACGTGAAAAACTCATTTCCCTTCCATGCGTGGCAGCTGAAACTGCTTTTTGATGTAGAGTAGGG GGGGGGGGGGGGGGGGGGGGFGGGGGGGGGEG=GGGFGGGGGGGGGGFGGGGGGGGGFGGGGBFFFEGDEF?EGGGG?BFFDEFDB5@?CC XT:A:U NM:i:0 SM:i:37 AM:i:25 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_251/1 83 chr4 104178743 60 91M = 104178367 -467 AGGTGTGAGCCACTGCACCCAGCCAACGTAAGACTTTAGTTGATACTCATTGTAGTAAAGAACAAGAATCAACTCTATTAATTTTAGATTA ?@157-=?=C?>C:CC?CBCCEFCGDGGDGGGGFFFGDGEGGGEGGF=FEDGGAGGGFGFGGEGGGGGGGGFGGGG?GGGGGGGGGGGGFG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_251/2 163 chr4 104178367 60 91M = 104178743 467 GCTAGTTCTGCCAAAACTACGATCCAGTTAACATCCTTCCACATATCTTTAAATTTTACTTTTTAATTTAGTTTTTTGAGTATATTATAGC DEEEDEFFFBEEDEEGGGFG?EEEEDFFFFGGFGGGDGGGDFGGEGGEEGGGGGGGDGGGGGFGGGEGGBFEFFFEDEADCEFDEGGBGDG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_253/2 145 chr6 74622818 37 91M * 0 0 GATTATTTAGTTGCCCATTTTTTTTCCTTATTCTGTTTCCCCTTAGATGTTAGTGATTTCTAAATGTTTTGGCCCTATTTCAGCTATACTG #####################A:A:B=>6C@?*=*?4A+A-==?5=C@6CC-5*.-@@@;;D?D,DBBBB=9)3/8??DEFCC?5C=BDDD XT:A:U NM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:38T33A18 foo_254/1 99 chr6 92412781 60 91M = 92413141 451 ACCATTATTCTATATAAGTGGCAAAGGGAGGAAGCAGTGACCAGAATCCAGAGACAATAGCTGGAGCTGAAGGCAGCTGGAGCAAAAATGC GGGGGGGGGGGGGFGGGGEGGGGGGGGGCGGGGGGGGBGFGGFGFGFGGGGEGBGFEDEEEDCF@>>-=-?<><;7747=0500;>7@588 XT:A:U NM:i:0 SM:i:37 AM:i:25 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_254/2 147 chr6 92413141 60 91M = 92412781 -451 GGGCCTCAGGGACCTGAGGTCAGCCTCTGGCCAGCAGCCTGCAAGGAGCTGAGGCCCAGAGTTCAACAGCCCTGGGGAAACTGATGTGCCA ####################?.;B01?=BAD?F=GFFFGFGGEEGGFFGGGADGGGG?GEFGGBGFGGGFGGGGGGGGGGGGGGFG?FFFF XT:A:U NM:i:4 SM:i:25 AM:i:25 X0:i:1 X1:i:0 XM:i:4 XO:i:0 XG:i:0 MD:Z:0A6T0A3A78 foo_256/1 99 chr12 98617437 60 91M = 98617820 474 CAATAAATATTGACAGAATGAAAACAATACCCTTAATAAATGTGAAAGAACAGAGTCTTCTTGTATTGCTGTTTTAAATATCCCGTAACTT GGGGGGGGGGFGEGGFFEGGGFGFGGGGDGEGGGGFGGGFGFDGGGCFGEFDFAFDFFFFEFF@EFDEBEACFF?BEEEDGBFGGFEAGFF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 
XG:i:0 MD:Z:91 foo_256/2 147 chr12 98617820 60 91M = 98617437 -474 TTTAAATATGGATTACAAATCCAGCTGCCTTAAAATGAAAGTAAAAGGTTTTCCTTTCTCAGTCTATTTTGTAAGTTGGAAATTAATGTAA EC3CCCC=AEB@C>>?BA-@C?D:GFGGGGGGGGGGGGGGCGGGGGGFGGGFGGFGFG?GGFGGGGCGEFGFGGFGGGGGGGGFGGGDGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_258/1 83 chr20 14052099 60 91M = 14051713 -477 TGTAGGGAAGACAACTAAGACCCCAGCTCCATGCTTCACCTGGTCACCTAGGCCCTTTGAAGAAATCTTGCAGTGCGGCATACCATTAGTC EGGFDGGGEEGFFFFFEFFBDGEGGGGEGGGFGGGGGEGEGGGGGEGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_258/2 163 chr20 14051713 60 91M = 14052099 477 ATGAACAAATGCAGATAATAGGGTACATAATTTTTAGTACTTGAAGCATAGCCATGAGTGTAATCTGATTGTCGGAAAGTATACAATTCTT GGGGGGGGFGFGGGGGGGGGGGGEGGGGGGGGGGGGFGGFGDGGFGGGGDGGGFGGGGDGGGGGGGGGGGGGGGGGGGGEFFGGGEFGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_260/1 99 chr16 63662422 60 91M = 63662802 471 TGTGCTGTCCTCGTCACTGTCCGACTGAACTCTTCATTCATTGAACAAACAGTTCAAAAGGTCCTTTGAGGAACTCCCTGGTGCCCTTTGG FFCFFDFGGFFGGGGFGFFGGGGGC=EEEEDGDFGGFGFFDGGGEGGFGGGGEGEGGEGEFBGEGGGFFFE5BBEFFFEFGBGGGF?FF=F XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_260/2 147 chr16 63662802 60 91M = 63662422 -471 CCCAAAAAGGGAGAAACGACTCTAAAATAGCAGAGTCCACTTTGAATCACAGTGAACAAACGGTTATGATCCATTTATGGTGATGAGTAAG #@+DDDD?DD=C@CBA=E:ADD>FFFGDF?GFFEFFDDBAGGGGDGGGCGGFGGGGFFGAFGGGGGGGGGGFGGDGGGGGDEGGGBGFGED XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:0T90 foo_262/1 99 chr18 25007594 60 91M = 25007971 468 CAATATCTCAATCTTTAGAACATACCTCAGCATCTCCACATCTCCTCCTACCTCTGTCCTGTGTTGCTGGGACACAGGGGCAGGAAATGGA GGFGGGGGGGGGGGGGGGFGFGGGGGGGFGEGGGGGFGGGGGDFGFFGGGFGGFGEGEGGEEGEDEEBEEFEE?EBCCADCCCC??;:<>6 XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_262/2 147 chr18 25007971 60 91M = 25007594 -468 AATTACTCTGGTTCATTTTTTGTCAAAGAATTTTTACTGCTGAATAGTGTCATAGACCTATTCATTTATTTATTTTCTCACTCTTCCCAGT EDEFDGGGEGGFDGFGGGGGGGGGGGFGDGGFFGGGGGGGGGGGGGGGGGEFGGGFGDGGGEGEGGGFGGFDGGGGFEE5EGGEFGGDGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_264/1 83 chr10 120158272 60 91M = 120157889 -474 AAGCAACTTCGTTTTTACCCCCACATAGGTAGTCAGCAACACAATATCTTGTAGTTGTGAGTGGTTAATTGAAGGATAATCCAGTAACACA GGGGEGGGEGDEDEEF8EFFFGEGGFFGGGGGFGGGGGGGEGGGGGEGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGFGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_264/2 163 chr10 120157889 60 91M = 120158272 474 AGACGAGGGCCTGTTTGGGCTCAATCATTAGAATCCAGGTTGCTGGAGAGGTTCTGTGAATCTCAAGTGGAACTTGGCAAATTTCAGCAAA GGGGGGGGGGGGGGGGGGGGEEGGGGGGGFGFGGGGGGFFGGGGGGFGFGGDGGGECD?EECEEEGDCFGDCEGGFGGAEFEGGEEFDFEF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_266/1 99 chr9 83424580 60 91M = 83424973 484 CTACAGAATTTAAACAAGATGCCTCTGGGGTTTCTTCCATTTTGACCATGAACTTAGATCACTAGAATGCTCACTATGGAAGACATTGAGC FBDFFGFGGGGGGGGGGGG=GGGGGGGGEFDFFEFBDDCEGFGGDEDFGFDFEGGFGEGEEEFFFEBEEBDFDF?FCDCEGEBG=?DEC5: XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_266/2 147 chr9 83424973 60 91M = 83424580 -484 GAAGTCTAAAGCTTAACACTGAGCCCGGCAGAAGCATAATTTACAGCCCCAAAGGGAAATACTATCCTACAATGTAAGAACCTCTCCTCTT =CDB:D=FFFBC??::GFGDGGFGCFFGFGEGGFEGBFDFEFADAEGGGGGGEFGBDGGEDGFDDFGFGGGFGGDGFFFDFGGEFGGDGGD XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_268/1 83 chr2 40577370 60 91M = 40576992 -469 CCCACAGCCTACATATCCTTCAAAATGTGCATGAGTAATGTTAAAAAATCTGTTTAGATGATTTCCAATTTCATCATTCTCAGTAAACTAC 
GGGBFFGEGGFGGFEGFFFEGGGGGEGFGGGGGFGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_268/2 163 chr2 40576992 60 91M = 40577370 469 ATTATTCAAAAGGATTTCAGAACTATCTGAAAAGACAGCATGGCTGTATGGCACGTATTTCAAACAGTTGGATCAAGATCTGAATAATGTT GGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGFGGGGEFGGGEGGGGGGDGGGEDGGGGGGGGGGEGGFFGGGDEFFGEGFGEGGEFEB XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_270/1 83 chr2 40577370 60 91M = 40576992 -469 CCCACAGCCTACATATCCTTCAAAATGTGCATGAGTAATGTTAAAAAATCTGTTTAGATGATTTCCAATTTCATCATTCTCAGTAAACTAC EEE:EEFFAFEFFFF=FFEEEEFDFF?AFAFEEFEFFDFEFFFFFFFFFFFEFFFFDFFFFDFFFFFDFFEFFDBFEEEEBEFFEFED=DE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_270/2 163 chr2 40576992 60 91M = 40577370 469 ATTATTCAAAAGGATTTCAGAACTATCTGAAAAGACAGCATGGCTGTATGGCACGTATTTCAAACAGTTGGATCAAGATCTGAATAATGTT GGEE?GFGGGGGGDGGFGEFDGGGEDBEEEFFF=FEEEEE5EDDEEFE?AEF=EEF:FFF=EEEEC;?DDEBBDCFFFFEEGGBFE=?E?A XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_272/1 83 chr3 154905363 60 91M = 154904993 -461 GACCATTCCTGAAGTACAAATACTTTGGAGAATGAGACAAATTAATGCCTGACTCCTGTATAAGAAGTATGGGTCTGGGAATGCAGATGAT GEGGGGBDFDFGGGGGFGFGEBEEEEDDGGGGFGGGGEGGGGGGFGDFFDFGGGGGGGGGFGGGGGGGGDGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:88C2 foo_272/2 163 chr3 154904993 60 91M = 154905363 461 CCAGGTACAAAAGAATACTATTACATAGACTCTTCCTTGGTTTGCACAGAGGATGACTAGGGCAATTCTGTCATTCATAAAAACCCTGGTT FFEFFGGGGGGGGFEGGGGGGGGGGGGGGGGGGGGGGGGGDGGGGFFBFDGFBGGEFDFFFFDCA=AC:BAAACDEFF?FGGE;AFC=EAC XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_274/1 99 chr3 6977819 60 91M = 6978195 467 CACTCATTCCTTTGGAAATATCCTATAACTTATTATCAGTAAGAGTGAAATTTGCGAAATCTCAATCACTAGCATCTTGTGTTCTCACAGC FGGFGGGGGGEGGGGGGGGGGGGGGGGDGGGFFGGFGGBGFFFEFBFEFFGGBBGGGGEBGGFGGFF5FFF:F:DGGGDFFGDGFABCCED XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_274/2 147 chr3 6978195 60 91M = 6977819 -467 AGTGTCTCCATGGACCTTTACACCAACCAGAAATTCTCCTCCACACCCTCCATCAATCGCACTTTCTCCTCTCCACTTAACCATTCATCCT ###########################?-=@@B9;C7'7)7A-;)>AA:AA5C6B9D;DDD@@B>??DABDDFFFFEEAEE:AEEE XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:37A53 foo_276/1 99 chr11 3666630 60 91M = 3667039 500 TATATATATATATATATATTTATAAGAAATCACCTTGAAGATTTGAAATGAGATGAGCTATTTGTGTGACTAGAGAAAAAAAAAACAAAAA GGDGGGBGGGFG?EGGGDFAFDBDFGF=GDFEDFDGFEGDBBEEBE=FD?D5B?E@5@D=E:DEBEBE5BCBDABE=E?A@>@<>0:?:08 XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_276/2 147 chr11 3667039 60 91M = 3666630 -500 AAAACAAAACAAGGCTAGGTGCAGTGGCTCGCGCCTGTAATCCCAGCACTTTGGGAGGCTGAGGTGGGCAGATCACGAGGTCAGGAGATCG #########@;;:4;->3==7B-4==DD=D:EEEE:DFDF:FB=FEFDDEDDDGGGGGEBGEEEAECAAFFEEEEE:EED=GGBDGDGDE= XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_278/1 99 chr11 112652415 60 91M = 112652768 444 AGGCTTTGTTGAATGAAGCAGAGAAGATTGTATAGTTGGGGCTGGTCTTGGTGAACACACATTATTACCCCCCACATCCCCTTTGTGTAGA GGGGGGGGGGGGGGGGFGFGGFFGDFAFFFFAFFFDGGGGGAGGE:AAC:90/':>>@>CC:E=A9*6=5?#################### XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:71A19 foo_278/2 147 chr11 112652768 60 91M = 112652415 -444 GGGGGGGGAAAAGGGAAAAGCCAGCCCTTTGTATAGAAATTTTGCTTTTTTTTCCCTCATTCTACTTTAGAACTGCAAGCTTGTGCACTGT #################@=.>A@?=D;=DFFACCCC-:DBABB-::E;B?FDADGGDC=BDFFFFFDDEDEFDDAGBEAEDEEA=EBFFFF XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:9G81 
foo_280/1 81 chr14 43458764 37 91M * 0 0 CTCATTGTGGTTTTTATTTGCATTACTCTACTGATCAGTGAGGTTGAGCTTTTTTTCATATGCTTCTTGGTCACATGTATGTCATATTTTG >::98;EE=@@;EFFBECDEED@C?DAEEBDDGGEEBGGGGEGGGGGGGEGGGGFFFFFFGGGGGGGGGGGGGGGGGGGGGDGGGGGFGGG XT:A:U NM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_281/1 99 chr3 75449080 60 91M = 75449462 473 AGGTAACAGTGAAGCCCATCCTCCCACACCCCCAGGAACCTGGAAATAATAGAAATGAAGCTGCCTCTGAAGGTTCAATTAGGTCCATCCA FFFEFFFFFEGGGGGGGGGGGGGGGDGGGGEGFEGGBFFGFGDGGFEGDGFDFGGBDDEEDGFDEAC=EC:A?==@@B>0@EE5EEEDEEE??CCB=?-;1=?9@B5@C=DC@AC=>CC:CACC5AEDCEEDCDBB?ECCEB-><@>@2==@2726::?### XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_287/1 99 chr5 4001037 60 91M = 4001407 461 CTTTATATTTGCTGGCAGCTGATTAGATGGTGCCCACCTGATTAAGGGTGGATCTGCCTTCCCCAGCCCACTGACTCAAATGTTAATCCCT EEEEDEEEEEEEEEDEEE?BAAACAAAAAC:AAC?AACCCEEEEEDBC-B8(4<7ECB:CC:DACAABB@BDE:BCDDD?BE?D=CB?@CA XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_287/2 147 chr5 4001407 60 91M = 4001037 -461 AATAATAGTAATAATAATCTTATGCCACATGATAAACGAGAAGGGAATAAAATGAAGATATTTTCTTAGCATAAGTGTATACATGCACAAG E=;-:==:EAE;CFFD5BDEE?@5ABACCCA?:EGGEDEDDFFF>EFGGGBDDD=D=D;>CDC:BD=EE?EAFFDFDEEDEEEE=EFFBDF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_289/1 99 chr5 8102459 60 91M = 8102844 476 ACTCAGGGGTCAGGAGCCTTCATAGAGTTGCAGTCATTATTTGTTTACCCTTTGGCTGTGCAGATGACTGTTTGCCCCTCAGAGGCAGAAA GGGGGGGGGFGDEGBGGGGGGFGGGBGGGEEGGFGFGGEGDFDGGGGDGGFFDEFEBE:DD@D??>8>>='=520BAAAAD@=BBD?AABD XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_289/2 147 chr5 8102844 60 91M = 8102459 -476 AGGATCACTTTGAGTCCAGGAGTTCAAGGCTGCAGTGCACCATGATTGTGCCACTGCACTCCAGCGTGGGAAACAGAGAGAGATTCTAGAA 5BAAC?A=CDDEDEE5FDFDFAABA>FAFDEDDDDDEDCEDAEEEEEEE?EDCDD:?BEAEGGFGGDFGDGGEGGGGGGGGGGGDGFGGBG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_291/1 99 chr1 113801412 60 91M = 113801792 471 ATTTGCTTGATATCTGTAAACAGACTTTTCTTTCAGGAAAGGAATTTGGAGGAGGGCAGAGCCATCATTATCTCTTTTTTACAAATCTACA FGFGDGGGGEFGGGEGGFGGFFFGFGGGGFGFGGEGGFGDEGGGGFGGFBDFGDGFGFGDEAG:BGD=GADFDFBFFFEDCGDGEGGDDGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_291/2 147 chr1 113801792 60 91M = 113801412 -471 TCTGTTCTTAAGAGGATTACATTGCAGAGGGGATAGATAGATAATAAGCTGTGTATAAATAAGATAAATTTAATTATAAATAATACTGATA EE:?EEAD5=DGAFGGDGGGGGFGEFFGGDFGGGDGGDGGGAGGGGGFGAEFBGGGDGDFGGGDGGBEEGFGFGFGEGGGFGGFGEGGGFG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_293/1 99 chr13 103952241 60 91M = 103952612 462 GTAACGATGAAGCCAAAGCCCTTCTTCCTCCCTGATATATGTGCTAAAAGATAGAGACAGAAAAATGCTGAGAAGAAAGAAAAGGCTGAGC GGGGGGGGGGGGGGGGGG?GGGGGGGGGFGGGGGBCBFFFECEE5CECCEFGGFDGEGEGGBGGEEGGGGEFEEGGBGGDFGG=FFGGEGB XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_293/2 147 chr13 103952612 60 91M = 103952241 -462 AAGAATTCAGTTTGTAGTGAAGCAGAATCCTCAGAGAGGAAGGATGATGGTACAGTCTATCTTGCCTTGTATATTAGGTTTTATTAAAGTT ECCE::GGGABFEGFGGEGGGBCDGGDEEGGGGGGGGGGEGGGGGGDGFGGGGDGGEEEDEEFBFGGGGGGFGFEGGGGGGFGGEGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_295/1 83 chr18 41680767 60 91M = 41680397 -461 TGCCCTTTTTTTTGGTTATTTCTTGATTATATGCTAAACAAGGGGTGGCAAAGAAAGACCAAGAGAGAAAAGGGCTGTCTTCAGAAGAGAA FEACGFFGFDFGGEGGCECCAGDEDFGGGEGGGGGEGGGGGGDGGFGGGGGGGFGFGEGGDGGGGGGGGGGGGGGGFGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_295/2 163 chr18 41680397 60 91M = 41680767 461 
TTGCTTGAACCCAGGAGGTGAAGGTTGCAGTGGGCCGAGATCGCGCCACTGCACTCCAGCCTAGGCAACAAGAGCAAAACTCCCTCTCAAA GGGGGGGDGGGGGGGGGGEFFFFFBEDDDFEGGGAFFDABBDE?EEEEEDF=FB?DBDD=ECDDEEEEFF?D@DAE?E:CADACC;CABAG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_297/1 65 chr9 29739213 37 91M * 0 0 GAAAAGGATGGCTTTTCTTTCTCAACGTGGGATCCTACTCTCCAACTGTGAGAGCAACGCTTCTCAAAGACTATCTCCAGTTGGCCATAGA FFFFFBGGAFFBEFFDGGGGEEGGFFFDDFEGGFGGEEGGGFGGGGEGEBEEEEE=EBEBBAEBED@EEDD?5BCEED:BACEEBFFEFE= XT:A:U NM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_298/1 99 chr1 152336073 60 91M = 152336429 447 TGGGAGGCTGAGGCACAAGAATCGCTTGAACCTAGGGGGTGGAGGTTGCCGAGATTGCACCACTGCATTCCAGCTGGGCCAACAGAGTGAG FFFDFFFFFF=EEEEFEBEFFFFFFFFDFFFFFFFFFFDBCC XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_298/2 147 chr1 152336429 60 91M = 152336073 -447 GGGAAAGATACGTGACTAACATACTAGGTAGAAAAAGAAAATCATCATCACTTCCTTTCCTTTAAGGACGACTTTCATTCAGGGGAGATCT CABC=ECC?CEECECB5EEEFEFGFGEGEEEDGEFGEGGFGBFFEFDEECEEDDFDDDEFEBEGGGFEGGEGGGDGFFBFDFGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_300/1 99 chr14 97106267 60 91M = 97106655 479 ACTTTATATGGTTTCTGAGATTGCATGTAATGACATTTATCATTTGCCCATGCAAATGTGATTTAATCTTAGTCGGTGATCCTTTATCAGT GGFGGGGGGGGGGFFGFGGFGGGFGGGGFFGFGGGGGFGGGGGFGGGGGGGGGFAFGGDEBGGGBGGAGGGGEDGGFG=GGGFFGCD??FE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_300/2 147 chr14 97106655 60 91M = 97106267 -479 TGGCATCTACGTCACAGATTGAGAAGCATGTTCCATATCTTTTCCAACTCCATAATCCTGTGGGCCAATAGTACAGGGATTTCTAATGTTT AECCCCBF@BFDF=GEGEFDGDGEDGFG?FGDDGGEDDGGGFEFGGEEEEE5DGGAGGGFGGEFGGGEEGFGGGFGGGGEGEGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_302/1 99 chr11 46886041 60 91M = 46886425 475 CAAAAGCTTCATAATAATATAACCATCTTTTTTTTTTCCGAGATGGAATCTTGCTGTGTTGCCCAGGCTGGCGTGCAGTGGCGCGATCTCA EEEEEDDEED?EEEEBEDEEFEFB=D?DDDBFFFFFEFDD5DBAD5<:CBAAC?A?5CCCFEE?:EE@EB##################### XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:71A19 foo_302/2 147 chr11 46886425 60 91M = 46886041 -475 CACTGAAGGCTCTCAACACCCCTAGGAGGTTGTACTGTTATTATTAGTTTGATGCAAAAGCAATTGCGGTTTTTGCCATTACTTTTAATGA ?7=?9:-:70=?A)=/4C=CA:9;>;*6>='>54:B@5ACCC;.=>46@B:?EE@@CC;>:AC?:CCC?C?C=C XT:A:U NM:i:2 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:18A11G60 foo_304/1 99 chr1 105905893 60 91M = 105906291 489 AATAATAAATGTGAGGTGATATCTCATTGTGGTTTTGATTTTGCATTTTCCTGATCATTAGTAATGTTGAACATTTCTTCATATACCTGTT GGGGGGGGFGGGGGGGEGFGGGGGGFGGGFGGGGGGGFGGGEGGEGFGGGGGGGEGGGGEGEGGGGGFAEFEEFFGGGGGGGGGGGDGGFG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_304/2 147 chr1 105906291 60 91M = 105905893 -489 CCCTATGTTTTCTTATAACAGTTTTACAGTTTCTATCTTAAGTTTACATTTTAATCCATTTTAGATTTATTTTTGTATAGGATGTGAGTCA DGGGGGFGDGEFFFFBDEGGGGGGGGEGGDGGDGGGGGDGGE=GGGGGFGFFDGGGFGGGGGBGGEGFGGGGEGGGGGGDGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_306/1 99 chr4 87408445 60 91M = 87408838 484 ACTCTACCACAAAGATACACACCATCTTCTCACCATCTTTACTGTGACATCTTGATTCCAACCTTTATCATCTTTCACCTGAATTCTTGCA GGGGFGGGGGGGGGFGGGGGGGGGGGGGGGGGGFFFGGGGGGGFEGGGGEGDGGEGFDGEGGGEGGEGEEGEEEEGFGGDGEEGGGEGFGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_306/2 147 chr4 87408838 60 91M = 87408445 -484 TTCAAATACTCTTTCCCCAGATTTTCATATGGCTGCCTTCTCACCATTCAAGTCTTAACTGAAATGTGGTTTCCTCCCAGAGGTCTTCCTT :BCBD5FDGDFGGCGGFGDGGA:EEEFFF5F?GDGGFFFFDEEEEE?FGGGGFEEGAFFFGDEEGFGGDGGGGGGGGDGGGGGBGGGGGEG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 
XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_308/1 83 chr4 150596485 60 91M = 150596109 -467 GAGGATTGCCTGAGCCCAAGAGGTGGAGGCAGCAGTGATCTATATTCACACCACTGCACTTCAGCCTGGGTAACAGAATGAGACCTTGTGT GGFEEFGGEFFF:FBFEE?GFGFEGGEEGEFGGGGGGEFGGEFFFFEDFCFGEGGGGFDGGGGFGGGGGGGGGGGGGGFGGGEGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_308/2 163 chr4 150596109 60 91M = 150596485 467 GACATGTCCCTTTTACAACTCCCAGACACTCAGTGTATAGCAGAGTGCTATTCCTTTGTTGAGCAAACTGAGCACCTACTATGTGCTAGAT GGBGGGBGEGGGGGGGDGGGGGGGG?EDE=CCC=ABDDDCEED-D:DDD?BFFDEFFGGGAGEFDFFEEFBFDBED5DDD?ADA=ADB5?A XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_310/1 83 chr4 189625940 60 91M = 189625553 -478 GACCCAGAAAAGAATTAAGCTGAGATATCTCTTAGTGATACTTTCACATCCAGATTTACGCTTAGATTCATGATGCATTACAACTAATAAA E=BAEGEDGGGGGGCGGGEGEFF=FFEDFDFDGGGGGGGFGGGGGGGGFEGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_310/2 163 chr4 189625553 60 91M = 189625940 478 ATAAGATTAGTCCAAATTATCAAGTATATTCCATCATTTTGAATACTCATAAAACATATTTTGATTTTTATTATTTTAAGTGAATATTTGG FGGGGGGFGGGGGGDGGGGGGGGGFGGGGGGGFGGGFGEFFGDGFGGGGGGDGGGGGGDGGGGEGGGEGGGGGFGGG>FGFG?FDDGGGGE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_312/1 65 chr4 187482106 37 91M * 0 0 AGTGATTACTTCTGCCTCAGTTAGGAAAGAATTAACAGGCGTAATTATAGTTGAGCCAGATAGGAGTTTTCCAGGAGGAGCACAACATCAA 77,56::A=:?A91-:0557ACC83A:A8A;6B;CCCC:-B8.;?/;8?########################################## XT:A:U NM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:39A7C43 foo_313/1 83 chr7 7294158 60 91M = 7293783 -466 TGGATAAATGGGTAAGGAAAATGTGGTATATATACAATAAATTATTATTCTGCCTTTAAAAAGAAGGAAATCCTGTCATTTGTAAGCACAC BF?FFGDGEGEAF?GGEFEEE=EEEFDEGFGGEEGDGEGGFEBGGGGGGGFEGGEEGGGGGGFGFGGGGGGGGFGDGGGFGGGGGGGDGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_313/2 163 chr7 7293783 60 91M = 7294158 466 TGAAAAACTGGTCAGCATCCACTAATCGTCACAGCCACAAGGAGACATCACCTCACACACGTTAGGGTGGCCATTATCAAAAAGTCAAAAA GFGGGGGGGGFGGGGGGGGGFGGGGGGDGGGFDEGGGGFFGGDGGGGEGDFEAEFDFFFFGFGGGGGEGGD?=:EFFFF5AAEECEEG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_315/1 99 chr3 51577695 60 91M = 51578044 440 AGCCCCATGGTTAGAAACTATGCAGACCTACATGGTACTCTTAACTGCTCTAGGATGAATGAGTATCCTTGAGCAGTAGGGTACCAGGTTG GGGGGGGGGGGGGDGGGGGFFFGGGGGGDGFFFFF:EEEEFGGGGFGGFFFGBGEGGBGDFEA@EEECCECBDD5C=DCBFDFFF?@D@CA XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_315/2 147 chr3 51578044 60 91M = 51577695 -440 GTCAGCATTCCAAAGTGGAGGAGTTAAGGTTTCACTTATGTAGGTAGAGACAGAGAAGTTCCAGCAGGATTATAACATTTTCCACAGAAGA FBAFDFBEE=EAEDDEGGFFDGGFGEGGEEGFDFFDGDFGGAGGGGGEGFGFGFGGGFFCGGBGGDGGGDFGDGFGEGGEGGGFGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_317/1 83 chr7 9669075 60 91M = 9668687 -479 TTTAAAAACACTGGAAGTAATATCATTTTTTGTTTTTGCTTGTTTGTTTGGTTGGTTGTTTTTGTTTGTTTTTTACCACAGGAGGGGTTCA E5AGGGGEGEDGDGGGE?GGDGEDEEGGGEGGGGGGFGGGGGEGFGGGGGGGGGGGGGGGGGGFFGGGGGGGGGEGGEGGGGGGGGGGGGG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:46G44 foo_317/2 163 chr7 9668687 60 91M = 9669075 479 AATATTATTGGATATTTTTGTATAATATATATGGATGTTTTTCATTTGTTTTTCCAGACTCATCTTCCACTCTCTATCCTGCTCCATATGC GEGGGGGFGGGGDGGGGGGGAGGEGGGGGGEGGGGFFDDGFCD5FGGFGGGGGFE5DDDDE5EDEGGFGGFFGFFEB?EE?BBAA=B@@:= XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_319/1 99 chr19 44641888 60 91M = 44642249 452 CCTCTTGCTCCTGGGTAGAAGCGTGGGGAAGTCAGTGAGTGTGAGGGAACCTGGATGTGAAACCAGAGGAAGGGGGTGGCCCTGGGGGTTC 
GGFGGGGGGGGGFGGFGGGGFGGFGFGGFGE?EEEBEBECFDFFFG=EEFFBGGCEGEG=@??@BAACA?EFFBFC'5A6?A5C-@# XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_319/2 147 chr19 44642249 60 91M = 44641888 -452 CATCCTTCTTTCTCTCTCCTCTCTTGGTCCTACTTTATTTTTCTTGCTGCTGTCTCCTTCCCCCTTTCCTGTGTCCTTTCCTCCATCCTGT D=@DBDC?CE:?5>*:AE?=EEGFGEFEFFGB>@@>5CEFFFABFFDFFDBFFEFFEBBEEEEFGGAFEDGGGFGGEEEEEFFGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_321/1 99 chr10 22132987 60 91M = 22133368 472 ACCAAGACTACTCTAGTAAATTAAGTATATACTAGGATCTATGAGGCTTTGTTGGTTATATCCTGTGTATTCTTGGCATTTAAAAAATTTT GGGGGGFGGFGGGGGGFGFGGGEGGGFFGGGGGGGGGGGFEGGFGGGGGGGFGFGEGGGGGFGFGFFFBFFFAFEFFAGGGEGGGFDFFAG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_321/2 147 chr10 22133368 60 91M = 22132987 -472 CAGCTATGTAATTAGTAGGTCATAGTACTTAAAACATCAAAAACCATGGAATGTCTTTAATTTGGTCAGATATTTACTGATATGTCATTAT GFFBFGGFFGGGGGGGGGGGGFGEGGGGGGGGGGGFGFGGGGGGGGFFFAFFGGFGGGGGGDGGGGGGGGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_323/1 99 chr12 118588121 60 91M = 118588482 452 GCTTATTTAATTCAGCAAGCACATTCAACATTTCTGATTATACTGAATACATTCAAATTCTCAGTCATCAGCTTCAGGAAGACCCTCCTGG GFGGGGGGGGGGDEFGGDGGFGGGGGGEGGFGGGGGGGFDEDFFFGGGGGGGEGGFEDGGFGFAFGBFD-=DBCBBBD@?:3.BBAB=EE?FEED5DE?EDA?FFFEFGDGGCFDFEEEBEE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_325/1 99 chr6 48478084 60 91M = 48478455 462 TTAAATTGTCTTAGTTTCTCATATGAGGGCAAAATAAACTTCTTGGGTCTGTAGATAGTATGTATTAGCCATATCATTTAATAAATATTGC GGGGGGGGGGGGFGGGGFGFDGFGGGGGGFGEFDFGGFGGGGFFGGAEGGGGEGGGGFEFEDBFEGFECGCEBBEFFFFFGFFGGDFGE?F XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_325/2 147 chr6 48478455 60 91M = 48478084 -462 TTAATCTTTAGAGTGGTCTTGAGAAAGCTCATAAGGTCTTTGATAAGTTAATGGAAAATAATGCTAAGATTCAATACTGATAAAATAAAGT ?CCA=??=GDGGGDDGEG?GGGGGDGEFBAEGFGGGGGFBGGGEGGGGGGGGGFGGGGGFGGDGGGGGGFGFGGFGGGGGFFGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_327/1 83 chr2 78122797 60 91M = 78122426 -462 TGTGATATTCTCATTTTCTCAGCCTTAAGAAGTCAAATGTCTCTGTAGGATAAGATATTCAAGGTAATATTTTCCTGTAAAGAAACACAGA ?FG?EFAGEGFFDDFFCEDGGBGGGGGGFGDGFFGGFEGDGGGGGEGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGFGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_327/2 163 chr2 78122426 60 91M = 78122797 462 AAAGTAGTGTCACTAGGGACTTCAATAATCTATCTGAAGTACATATAGCAGCATCTGGATTATTTGCAATCTATTAGCTTCAGATCCAAAA GGGGGGGGGGGGGGFGGGGGGGGGGGDGGGGGGGGBEEECEEEEEEFFEFGGEDGGGGBGGGBGGG=FGGGGDBGDEF?GE?D?AEE:EEE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_329/1 99 chr10 94708971 60 91M = 94709341 461 AAAGGTAATAAAATGTAGCAAAGATAGTTTTTCAACAAATAGTGCTGAATCAACTGGACATCCACATGTAAAAAAATGAATCTAGACCAGA EEEEEEGGGGFGGGGDEGGFGGGGGGGGGGFGGGFGGGDGE4CEEFFFFDGF=GFGCFGFGGDFGFGG:CC@B?CECEEDFGEFFAEFFEF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_329/2 147 chr10 94709341 60 91M = 94708971 -461 AAGACTTATCTGATAAAGGACTCTTGTCTGAAATTGCAAAGAACTCCTAAAACTCAATGAGAAGAAAATGAACAATTGATTTTAAAATGGG EACBBE=EBCEEEBGGEEEC?G:GGEGFGFFDGGDGBFFFDEE?DEEEEEEBEEEEGGFGFFGGFGEFGGGGGGGGGGGAGFEGGFGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_331/1 99 chr13 69209029 60 91M = 69209430 492 GTAGCCAATATGAGAAATAGAAACAGCTCTGCTCTCGGCTCCCAGTGAGAAGGAAAAAATGGCAAGTGAATTCTGCATCTTCAATTGAGGT GDGGGGGFGGGGGFFDEDEEGGGDGFFGGGDFFGBFFFFGD=EACB@?=@AAC9'9>>:>4/<):>>?5;@:=C@E?CB?###### XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 
XO:i:0 XG:i:0 MD:Z:12G78 foo_331/2 147 chr13 69209430 60 91M = 69209029 -492 AACAGGAGTATTTGCATACTTATGCTCTGGAAACTCCCATGAGGCAGGAGATCCGTCCACTCCCGTGGGAAGGAGGCTGATGCCAGGGAGC ######################?C@?B?CDCEBDEFEBE:FCEBBCEDECCAECCBBC2BBAGFGGGEGGGGEEFGFDF:FGGGFGGGGGG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:0C90 foo_333/1 83 chr11 88357992 60 91M = 88357628 -455 TCTGGAAAAACTATTTTCATACCCACAATCCTAAAGGTTAGATGCCATAACTCATGCTCCTACATATATAACCAATTTTGTTTATTTATTT @BBEBDA?A;?<@CC@==???=@>=>@?;C?:FDEFCBBCB=BFAFEGEEGEDDGGGEFGGEFGCGGGGGDGEGGGGGGGGGFGGGGFFGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_333/2 163 chr11 88357628 60 91M = 88357992 455 TCCAGGGCATCATATACTAAAGAATCATTCCATCTAATTTCTTCAAATCTCAAGAACACACACCTGGCAAATCATGAAAATAAATTTTATG GGGGGGGGFGGGGGFGGGGGGGFGAGFGGEGGFGBGFFGEFEFFFGGFGGFGGGDEFGGDG?GGDBEEE=DDEBFEE-EE5C>>>DECE4A XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_335/1 99 chr3 62264056 39 91M = 62264419 454 CCCGGCTAATTTTTTGTATTTTTAGTAGAGACAGGGTTTCACCTTGTTAGCCAGGATGGTCTCGATCTCCTGACCTCGTGATCCACCCGCC GGGGGGGGEGGGGGFFEGGGGGGGGGFGGGGGGGGDCGGGGEGGFBDFBBGGGGGDDDGBFBDDD=ECBEB=CEEEABCEBB:EBEEEECG XT:A:U NM:i:0 SM:i:2 AM:i:2 X0:i:1 X1:i:139 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_335/2 147 chr3 62264419 39 91M = 62264056 -454 ACACTGAGGTGTTTCCCTGTCACACCCCTCACTTGAAATGCTATCCCCTTCACTCCAATGCTCAGATGATGACCTTCTCTTTGAAACTATT ;@<=9>>>+DADDEFDE5EFADEEEE5EED=EEEEEEEEAEEEFGGGGEGEGGGFGGGEFEEEEEE-EGGFGFGGGFG XT:A:U NM:i:1 SM:i:37 AM:i:2 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:14A76 foo_337/1 99 chr1 74251269 60 91M = 74251637 459 AGTTTAGGACAGTGCCAGATGTCAGCTAGTGAGAGATAAAAGATGAAAAGTTCCTCAAACAGTGTTGTAATGCAATAAGCTACATTATACA GGGGGGGGGGGGGGGGGGGGGGGGGGGGGDFDFDFGGGGGEGGGGGGFEGEFEEGGGFFGGGEGBCECCEEGDEEGFGGFFGBGDGDAAEF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_337/2 147 chr1 74251637 60 91M = 74251269 -459 TCCTCTCCCTACTTCCCCCCTTCCCCCTCCAATAGGTTCCAGTGACTGTAGTTCCCTTCCTTGTGTCCATGAGTTCTCATCTTTTAGCTTC 5==1=0=B>:7>>-C?'AA<:9C(DEGEGGGGGGGGDGGGGGGGGGGGGFGGGGGGGFGGGGGGGDGGGFGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:2 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:16A6A67 foo_339/2 129 chr12 107536519 25 91M * 0 0 GACCCCGGGCGTTAACTGACCCACCCCATGGCAGACACTCCCACCTTCTTCCATCAGCACCCCCAGGACACTTTGAACACCCCTGCCTGCG ,333,+6,.,767,7766+3,77+611(86733;,7,6=(57+937367*6664*77776AAC8:A?-:A@6.>6.CC@?-?B5B>@.@## XT:A:U NM:i:4 SM:i:25 AM:i:0 X0:i:1 X1:i:0 XM:i:4 XO:i:0 XG:i:0 MD:Z:9T13G56A8T1 foo_340/1 99 chr2 169728182 60 91M = 169728563 472 AAAAGGCTGTGAGACACGAAGGCCCAATAAAACCTCCTCATTGGATAAAATGACTGCCCATCTATTTTTCTACCTGGATTGGATGTAAGTG GGGGGDFGFGGGGGGGGGGFGGFGGGGFDGFFDGGGGFGGGBGGGGBGGDC=B?CCB@D:>=>>@7??####################### XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:73A17 foo_340/2 147 chr2 169728563 60 91M = 169728182 -472 ATATTTCCCTTTCCCTCTTCATTTGCAGAGGAAAATAGACAAGAAAAAACTCTCAAAGGGTAGAGTCAGAAACAGTGAGAGACATTTTAAC EBEEEEEFFFEBBDFEDBEFDF=EDAGFGGGGGGGG?DFFGEEEEFEECEBGEAGGGFGFEGGGGGGGGGGFGGGGGGGEGGEGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_342/1 81 chr5 137913472 25 91M * 0 0 TAAGATCAACTTTAAAGTTATGAGACTACTGGGTAAAGCAGATTCTTGTTTGCTATAAAGATGTTAAAAGGTGTGAAAATTATATATATAT ########################################################################################### XT:A:U NM:i:4 SM:i:25 AM:i:0 X0:i:1 X1:i:0 XM:i:4 XO:i:0 XG:i:0 MD:Z:1C26T4A6T50 foo_343/1 83 chr3 121089753 60 91M = 121089378 -466 AAATTGTTTAAAGCAAAGTAACAACTGTTGCGGGAAGTCAGGGACCCTGAACACAGGGACCAGCTGGAGCCACGGCAGAGGAACATAAATT 
GEABEEDABDC:=;B>9;668=?5?6?CEE=AEBEBC@A=CC@::DEE=EEEEEEEDC:DDDEF=FFGGFGGGGGGGGGGGGGGGEGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_343/2 163 chr3 121089378 60 91M = 121089753 466 TCTTCAAGAATGCCAGAAACACATTATAGACAGGGTATATACAGGGTATCAACAGTTGACTTCTTAACAGAAACAATGAAAGCCAGAAGAA GGGGGGDFGGFC?DFAEEEEDG=GFEEEEEEEEEB-CA?:BDDDDDCEDBDD?DB=CC=:D?BCDBEAECBCEBBCE:CCFD:EF?5@=5C XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_345/1 99 chr9 28435444 60 91M = 28435819 466 GGAATGAGCTAGAAAAACAGGAAATAAACTGGAGAAATAGCTATGTACACAGTAGCAGGCTTGTGAAGGAGACAAAATAGTGTCACCATCT GGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGFGGGGGGEGGGGFGEFFGGGEB?B############################# XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_345/2 147 chr9 28435819 60 91M = 28435444 -466 TAATTCCTTTTGTGATTCTCTGTTAACCCAGAAGGGGGAAAAAAACATGATTATGGGTTATGATTTACTACCTCACATTACTAAGGGCATC FDFEAFGEEEGBCCC5DFFFAGGFEGGFFGEGFGFGFCEEEGGFGGDEGGFGGEGGGGFGGFDFFFEFEDFGGGDGGFGGGGGGGGFGGEG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_347/1 99 chr4 167906195 36 91M = 167906577 473 ATATACAAAAGTTAACTCAAGATGGATTAAAGACTTAAATGTTAGACCTAAAACCATAAAAACCCTAGAAGAAAACCTAGGCAATACCATT GGFGGGGGGGGFGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGEGGGGGGGGGGEGGFGEEEGGGGEEFGGF;GGEGFGGGGEGGEFGE XT:A:R NM:i:1 SM:i:0 AM:i:0 X0:i:2 X1:i:27 XM:i:1 XO:i:0 XG:i:0 MD:Z:52G38 foo_347/2 147 chr4 167906577 37 91M = 167906195 -473 TGAAGACATTTATGCCACCAACAGACACAGAAAAAATGCTCATCATCACTGGCCATCAGGGAAATGCATATCAAAACCACAATGAGATACC ??C=??ABE=CC=BCA@EBBAGGGFGGGGEEFGGEGGDGDGGFGFGGGGDBGGG?GGEGGGGGGGGEGEGGGGGFGGGGGEFFFFF=DBDD XT:A:U NM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:0C4T85 foo_349/1 83 chr1 162223484 60 91M = 162223102 -473 AAAATTGTGTATTTTCATCCAATAATAGTTTCTATATGTTCCCAGGTAATTTCTTTAGCAGGACTTTCTCTTCCCCTTTCAAATAGTGTCC EFFFBFEEEEADDEEEEDFFFEEEE=FFGGEGGBGGGGGEFFFFBFDGGGGGGGGDFG=GGGGGFGGGFGEGGGGGFGGGGGGGGGGGGGE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_349/2 163 chr1 162223102 60 91M = 162223484 473 AGAGATTTTTATCTCTCAGTGAAGTGACACAGCCATCTATCACTCAGGCTAATGTAAATGCTTTGGACAACTCTAAAAGGGTTGGGGGGGA FFFFFAFGGGFFGGGGGGGGGGGGEFGGGFGDFDGGGEGFGGGGGFDFFFGDGDBGFGGGFGGGGD=FFFGGGGEGEGGFFCFBEABCC>7 XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_351/1 83 chr7 90537017 60 91M = 90536658 -450 CAGATTTCCAATGGAAACATTTTTTAAAATTTTTTGTTTGTTGTTTGTTAGCACAGGGTCTTGCTCTGTCACTCAGGCTGGAGTACGGTGG GAGGFGEEEGG?FGGGCFEFDFFFFFCEAA;EGGEGGFGGGGEFDGFFFCDDEGEGGFDGGEBGFGGGFGFFGGGGGDGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_351/2 163 chr7 90536658 60 91M = 90537017 450 GGTGAGTGCCTGTAGTCTCAGGTACTTGGGAGGCTGAGGTGGAAGGATCCCTTGAGCCCAAGAATTCAAGGCTACAGTGAGCTATGATTGT FFEFFEEFDFGGGFFFGGGGFF=FFEEBEEBGGF:C=CC:CCDDDEEEEEEEECAFFFFEGGFADGFEFEA5CCA:@>@??=???A@AAA# XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_353/1 83 chr20 20401005 60 91M = 20400634 -462 ACCCTCCTTGCACTTCTCAGGTGGTCCCTGCCACATTCTTTGATTTATTAGGGTCATTTCATTTGCTTTGGTTTCTGGTAGGCACTCTGAA ##########################CCCECFEGGGGCFGGGGFGGGEFCGFF?FFFDGGGFGGGGGDEGGGFEEGGGGFGGGEGFGFGGG XT:A:U NM:i:3 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:3 XO:i:0 XG:i:0 MD:Z:1A3A19A65 foo_353/2 163 chr20 20400634 60 91M = 20401005 462 CACAATATGTTTCAAATTATTATCAATACCAATTAGAAACAAAGAGATTCTAAGTTTTATTAGGAGGTCACTTTTTTTGCATCAATTTTAT ?4?3;CACCCGEGGGFGGFGGFGGGGGFGFGCGFGGGGEGFFGGGGDGGGGGGGGGGEFGGGGGEFFCFFDFFDFGGGGAAEGGDEGEGFD XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 
XG:i:0 MD:Z:91 foo_355/1 99 chr9 28773355 60 91M = 28773721 457 CAATATAAATGCTGTGTAAATAGTTGTTATACTATATTTTTAAATTTTATTATTTCTATTTGTATTTTTTAATACTTAATATTTTTTTACT GGGGGGEGGGGGGGGGFGGG5CCCCFDFFFGDGGGGGGGDGGBFGGFEGGG>GGFCGGGEGGBAGEEDCC>CEBEEFFFFDGGGFF>GGFG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_355/2 147 chr9 28773721 60 91M = 28773355 -457 AGATGTACAAAATCAATGTCTTAGTCAATTCAGGCTGCTATAACAGAGTGCTATAGACTGGGTGACTTAAATAGTAGAAATTTATTTCTCA CECCEA4EEEEC=DDDBCBEDGDGGFFFFF:FGFGFDFGGGFGGGGGGGFGFGGEGGGG?GGGGGDGGFGGFGGBGGGGBFDGGGGGDDGB XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_357/1 83 chr9 28969310 60 91M = 28968967 -434 CATGGATCTCTCTAGTGTACCATATGGTGGCTTTCAAAGTAGCTATGGGTGGTAGAAAAGGTAAAAAGGGATATTTCCAAAAAGAGAGCTG FFFFBEGEGGGEDAEEBB:BBEE-FCEEEECEGDEFGFGGGGGGDEGGGGGFBFGGGGGEGGGGGGGGFGGGGGGFGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_357/2 163 chr9 28968967 60 91M = 28969310 434 AGGAGTCTTGAATCCACAGGCAGGAGTGAGACTGAGCACTAAAAAACAGAAACTTCAATACTATCAGAACTGTCCAATATTTCTCTCTTCT GGGFGFGGGGGFGGGGGGEGGGFG?C?:ACAAAA?FGEDFEFEEFGAD@FFDFAF?AEDDD?EEBAB-ECBCB?DBDBCBEE:CA@@CCBC XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_359/1 83 chr9 28969310 60 91M = 28968967 -434 CATGGATCTCTCTAGTGTACCATATGGTGGCTTTCAAAGTAGCTATGGGTGGTAGAAAAGGTAAAAAGGGATATTTCCAAAAAGAGAGCTG FGGFGGEGFGGGFGGGFDCFFEAEFGFFFGGGGGGGGGFGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_359/2 163 chr9 28968967 60 91M = 28969310 434 AGGAGTCTTGAATCCACAGGCAGGAGTGAGACTGAGCACTAAAAAACAGAAACTTCAATACTATCAGAACTGTCCAATATTTCTCTCTTCT GGGGGGGGGGGGGGGGGGGFGGGG=C=C?CCDCCBGEFGFFGEGGGGGFGGEFBGFGGFGGBGFDA?CEACECEE5@BB@CGEGFFF?FDD XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_361/1 65 chr5 58194288 37 91M * 0 0 TTATGAGGGAGAAAGAGAGATTGCACTGAATGGAGAAGTAAATGGTGATGAACAAGATGCGACACTATTCTTTCAGGTACTTTGAAAGAAA AC=CAACC==DD:DDEEBAEE?EAED:ECED?C5D@CBBDEE:BA:>7@>AC5??;:8<)CC@?>AA:A,6;,95CC=?@C66@:@*>9;@ XT:A:U NM:i:1 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:59A31 foo_362/1 83 chr20 39570998 60 91M = 39570641 -448 ATTCACTCACTGAATAAGTTATTTGATAAGTTCAACTAATTAGGTCTTTATACTAGCTACTAACCAACAGAAACATACAGTTCTACCTATG AE:?CEEFDFAEFEFFEEGEGEFEEEAFDDD?FFCFEGFEEFGEDEGGEGGEFFGGEGGFGGGGGGGGGGGGGFGGFGGGGGGGFGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_362/2 163 chr20 39570641 60 91M = 39570998 448 AAGACAGTGTGGGAATTGGTGGTTGTACCCAGTGAAGAAATGGGAAGACTCAAGAAATTACCCACCCAGGCTGAGCTGACAGAAGCCTTAT GGGGGGGGGGGGGFGGGFGDFFFFEFFFFF=EEEEGGGAGFFEEEFFFFFGGFGFFGGGGGGGGGGG>GGEGGGGGGFGFFFEFFCDG?AG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_364/1 99 chr14 26689993 60 91M = 26690365 463 GTGATCCTTTGGTGATATCACTACATTCAGATTTTTCATGGTGCCAAAATTCTTGCTCTGTTTCCTTGTCATCTGGAAACACCAGCACTTT GGGGGGGGGGGGEGGFFFFBGGGGFGGGGGGGGFGGGFGEDEGGGGGDAFGEGGEFFGGGEEGDGEFFBDEFFFDE=EEE?FBCFGBFGGF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_364/2 147 chr14 26690365 60 91M = 26689993 -463 GAGCCAGCTGAGGCCAATGTGACTGGGTATATACTAGATCCTTGATTACTGGAAGAAGCTCTCCGTTGCCTCAGGCAATGAGCTGATTCCT >BCDC:EE:DDDDE5DDDEFFEEAGGF?FFDGGGGGEDE5EEFGFFGFFEEGEDGGDGGGGFGGGEGFFGGGFGFGGGDGGGGGGGFEGGD XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:49C41 foo_366/1 99 chr3 159878292 60 91M = 159878696 495 AGAGATTTTCAGTGTAGATGAAACATCCTTCTTTTGGAAGAAGATGCCATCTGTGACTTTCATAGCTGGAGAAAAGTCAATGCCTGGCTTC 
GGGGGGGGGGGGGGGEGEGGGGGGGGGGGGFDGGGGGGGGGGGGFG?GGEGGGEEGGGGFFGGEBFEECDFCGEFF?EFFFFDFD?=:@DA XT:A:U NM:i:2 SM:i:37 AM:i:25 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:72G6G11 foo_366/2 147 chr3 159878696 60 91M = 159878292 -495 TCCCCCCCCCGCCAATACAACATTCATTCTGCAATCCCTGGATCAAAGAGTAATTTTGACTTTCAAGTCTTATTATTTAAGAAATACATTT #######BEA?GGGADFFFFADGGDGEGGGGGGDEGFEDFF=FG=GGGGFGGGGGGDFGGGGGFGGGGFGGGDGGFGEGFGGGGDFGGGGG XT:A:U NM:i:4 SM:i:25 AM:i:25 X0:i:1 X1:i:0 XM:i:4 XO:i:0 XG:i:0 MD:Z:1T2A0T0G84 foo_368/1 99 chr12 42102681 60 91M = 42103053 463 TTCTATACAAACCTTTATTGTGATTTTTAAAACTTCAATACATTCAACTTATCAAAAAAATAAAAATACTACATTTCTTCATAGTGCAAAG GGFGGGGGGGGGGGGGGGFGGGGGGGGGFGGGFGGGEGDGF:FFFGBGGGGGGGEGG=EGCCCCA=CCCFGGGFFGFFGGGDDGEG=GDFG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_368/2 147 chr12 42103053 60 91M = 42102681 -463 AGGTTGATAAACCTTAAAATGTAAAATATCAAAGGATCAAAAACATCTAAAACTGATCTGGAATGAATTATTTGTAATTTTAATATTCTAT GGGGGFGGGGEEE=GGGDGBGEEGGGGGGGGGGGEGGFEGGGGGDEGGGDGFGFGGFGGGGGGFGGGFGGGGGGEGGGGGGGGGGGGGGGF XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:56C34 foo_370/1 83 chr21 15766367 60 91M = 15765965 -493 CTGGACTTGACCTTTTAGATATATTTTGGTTCTGGTCTTGGCTTTTGGGTAAGTAGCTATACAGAGTTCATCCCTCACAGCCAGTTCAAGC A?C@?CCBB?C@DDCAAEEEEDDEFDDFEF:EEEBBDFEEDAGGGEGGGFGGFEEGFFFDEECEFFEEEEEEEEEEEGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_370/2 163 chr21 15765965 60 91M = 15766367 493 TTCTTACCCCAGAGAAAGTGGAAAGTCAAACCCTGATGCCCATATTTGCCTGTATCATGTCCAAGAAATTATGGTAAATTGTGAATCAGTC GGGGGGGGGGGGFGGGGGEGGGGGGGGGGFGGFGGGGGGGGFFGGGGGFGGFGGGGGGFGGEGGGDGFGGGFGGEGEGGGGEGGDGGEGEF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_372/1 99 chr4 146338059 60 91M = 146338451 483 AAGCCTCATTACCCTCTCAGAGGTGCCAGGGAGCATGCACTTCTCAGATGTCTTATCCCCAGTTCAGGAGGCCCTTCCTCCAAGGAGATGA GGFGGGGGFGFGGGEGDFGGBCC6CEEEEAFGFBE?FFFFGFGFEBFDEGGDED=FDEAFDEAEEEDD?A8>5A@=4C<:A,,??ADDDDD-DCC=5C?5AAA=A?A>AA=.=.=2;;1-2:=*>=C:@>?>C:ADADAAA6=::5?>=-/@2677(3?35+)70;<85;,36;3DDDD-?AAACFFC6E7 XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:11T79 foo_378/1 65 chr7 32304528 37 91M * 0 0 AGTTCGGGGAGGTGTGATGCTGAAAACCAGCTTCCGCGCGACGCCGGGTCGGAGCCCGCGAGGGGGGCCGGGGGGGACAGCCCAGGGGGCG DFDGGGEEGDEE;EEFFDDEGGGGGGAFGFFDFFFGAGGEFFGGGE=D@ECD:DBEEAEEABDBD$4&3?CA:??-B###### XT:A:U NM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:65T21T3 foo_379/1 99 chr8 75728528 60 91M = 75728901 464 TAAATACAATGGCAAGTATAAAATAAAGTATTTTTTTGATTCCCCATTCAAGCAGTAATTAAATTACATGTACCCATAATGTACATTCCCA GGGGGGGGFGGGGGGGGGGGGGGGGGGFDGGGGGGGGEDGGGGGGGGGFGGGGFGGGDDGGFGGGGGDFGFFFDFFFFFFGGGGGGGEGGE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_379/2 147 chr8 75728901 60 91M = 75728528 -464 GAGAGAATCGTATTTGGGGTCTACAGAAGAGCAGTAGAGATATGCTTTTTTCTTCATTTTCAGTTAGAGGAGTTTCAAGGAAAAGGTTTGG GGGGFG=GGEGGGFGGEGGFAGGGGGGGGGGGFGGGGFGFGGGGGGGFGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_381/1 83 chr8 53727412 60 91M = 53727042 -461 TACTAAAAATACAACAAATTAGCTGGGCGTGGTGTCACGCACCTGTAGTCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCACTTGAACC 7AB7ACAC=>=AAAAC;;;;:5DADF@CDCF?@@?@?B0??C@@EAEFBFFECEEECGDGGGGFGGGGGGGGGGGGGGGGGF?FFFFFBFF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_381/2 163 chr8 53727042 60 91M = 53727412 461 GCATATTCCACAATAAAAACTAATGTTCAGAAATTACCACTTGCTGAATTTTGATTTAGTAACAAAGAAGAATATCGACAATCATCTGAAA 
EE=EEFFEF5GFGGGGGGGGGEGFDGGGGGGGGGGDGGGDGFGFGFFFFFGGDAGGGEGGGGFGEGGGGGGDA=GFDFFDAFAFFD?EEEG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_383/1 83 chr6 132570259 60 91M = 132569891 -459 TTTATTCCATTGTGGTCCAAAAGTGTGGTTGGTATGATTTCAATTTTCTTTTAATTTATTGAGAGGTACATTATGGCTGAGCATGTAATCA GGFFGEGGFGGEAFEFBDFFDFFBFFGGGEFGGEGGGGGGFGGGAGGGGGFFGGGGFGFGGGGGGGGGGFDGGGGGGEGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_383/2 163 chr6 132569891 60 91M = 132570259 459 TATGAATTTCTGGGTCTCAAATTCATTTATTTCTGATTTAATTTTAGTTATTTCTTTTCATAAGCTAGCTTTGGAGTTAGTTTGTTCTTGT GGGGGGGFGGGGGGEFFFEFGGGGGGGGFGGGFGGEFGGGFFFFFGEFGGGGFGGGF@GGEGGGGGEGGGFGG?DECFEE??DD@GGBGDG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_385/1 99 chr9 127316361 60 91M = 127316726 456 GTGCTCCCACACAGCGCCTTGTTCCTACCTTGGTAGGAACACTCATTGGGCTCTTCTATAATTTACCAGCTCAAGCTGCCTCGCTGCCTCC FFFFFFFFFFFFFFEFFFFFEEE9E>.@C:>2944?=5<.87782>A=>A1428-847<))7631::*:6@5>>@4;-71><>>?> XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_385/2 147 chr9 127316726 60 91M = 127316361 -456 AGGAGCTGAGGCTGAGAAGCTTGGCCCAGATTGTTGGTCAAGATTTGGATTTTCTCTCATATGCAACAGGTGCTAACAGAGCAGGAAAGTG 9EEEEE?GGGGGGFGGDEEE?DFFF=GFGGGDGGGGGGGFGG?GGFAGFGGGGGFEEEGEG?GFDGFDFFBEFGFEGGGEFGGFGGFGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_387/1 99 chr13 100847627 60 91M = 100848010 474 TCCTGCATGTATCATGGTTAGGGGAAAGACAAGCACTGCAAGAGGCAAAAACATGTCACATTATATAACATTATAAAATATGTTCTAGATA GGGGGFGGGGGGGFGGGGFGFFFFDFGDDGEGGDGGGGGGEG=GGGGGGGFBCCFDFFFFGGGFGEEDEEEEEEEGFGGGFGEEFFGGGGG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:78C12 foo_387/2 147 chr13 100848010 60 91M = 100847627 -474 TATATACCTAAGACAATAGATCCATAAATATATGTATATTATATAAATATACACACGACCTACAATGTATATAATGTAAATATACACATAA FGFGEEEG?FGGGGFFEGFBBG=FGGFGGGGGGGGGEGFGGFDDFFEEEEEEBDAFGFGGGDGGGGGGFGGFGGGGFGDGDGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_389/1 99 chr18 12919076 60 91M = 12919448 463 GTTGGTGAAATACAACCAGCATTTTTCACGCCCATCGATTCCGCCCGTGGTCAGGAAATAAGAAGGGGCATAGCAGAGCAGCTCTTGTCTG GGFGGFGGGGGGGGGGGGGGFGGGGGGGGGEGGGGGGDGGGFGGGGGEGGDFBFFEEGGGGGGFFEEGAEEFEBEFBDGFG?GEEGEEEEG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_389/2 147 chr18 12919448 60 91M = 12919076 -463 AAATACCACAGATGGGTGGCTTAAACAACAAATATTTATTTTTTTCTCACAGTTTTGGAGGCTGGAAGTCCGAGATCAAGGTGCCAGCAGG CCBCAEFDFEGGFGGFGGAFGGFGBGGFGGDGGGFGGGFGGGGGGGDGDEGG?GGGGGBGGGFGEGDGAGFGGGGGGGGGGGEGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_391/1 99 chr3 188423127 60 91M = 188423505 469 GACTTGAAACTATGGCTCCTTACTTTTAGTTAAGTGTTTTTTTTTTTTTTTCTTGAACTGATAGGAATATGAACTTCTAAGGGTAACTGCC DDDDDDDDDDDDDDDDDDD=C=@ACCC5C@C;>>:D@DDDCCCCCCACCA######################################### XT:A:U NM:i:2 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:50C30T9 foo_391/2 147 chr3 188423505 60 91M = 188423127 -469 ACCCCCCACCCAAGCCTCAAAATCCCTAAGGGAACAATACGGAGCAGGAAGGCCCAGTGCAGAGCTGAGGTGTCACTGTCTGTAGGTTATA ######?;A?-FFFFFGGGEGGGEGGGGGGGGGGGGGGGFGGGFFGGGGGGBFDGGGBGGGGGGGGGGFGGGGGFGGGGGFGGGGGGGGGG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:1A89 foo_393/1 99 chr22 29351711 60 91M = 29352101 481 AAAGTGCTGGGATTCTAGGTATGAGCCACCCTGCTCGGCCTATAATGGCACTTTCCTATCCCATTGATGAGGCTCTACTCTCATGACCTAA FFGGGFFGGGGGGGGGGGGBFEDFDDGGFGGGDFFGFGGGGDFFGGEAFBECFFFGGGGGFE5FF@:@@?EEFEGFBDGFEE?DB8;C>?> XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 
XG:i:0 MD:Z:91 foo_393/2 147 chr22 29352101 60 91M = 29351711 -481 GTAGTGAGTGAAAGCTGACTCCTGGGAGACTTCTGCGTGGTCCTGGTTCTCTCTCCAGACTGCACTGCGCAAGTTTCTCTTCCTGATGGTC ?B-EABDBEAEC::AA@AE5CCEFFEBFDDF?AAAA=5EE?FFFFFEEE=EBFCFFAC@C:EECEE=FDFGGDGGGFGGGGGGGGFGGGEG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_395/1 83 chr16 31206541 60 91M = 31206168 -464 CTGAACAGCTTGTCATAATTCTTTGCCAATAGTCAAATTCTTCACGTGGTCTCTCTAAACACATACTGGCCATTGATGGTGTCAATGCCTG FEEF?FBBFGGFF?EGFEGDEG=EEGGGGEGFGGGGB?EEECFEGGGGGFFEGDGGEEEEEFDFBDGGGGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_395/2 163 chr16 31206168 60 91M = 31206541 464 ATTTAGGCTCTTTTTTTGTTCCGTATTAATTTTAGAATAGTTTTTTCTATCTGTGAACAATGGCATTGGTAGTTTGATACAAATAGCATTG GFGGGGGGGGGFGGGGGBGGGGGGEGGGGGGGGADGGGGGFDFEEFFCDFGFGDGGGGGGFGGGGFFGDEFDDFFBGGGGGFFGGAGGAGF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_397/2 129 chr2 114554339 25 91M * 0 0 CGATCATGAACAACTAACTTATTTCTCACAGGTCTGGAGGTTGCTATGTCCAAGATGAAGGCACCAGCAGATTTGGCATCTGGAGAGCGCC 673777<7,,,>67;,77775AA?D@?5AC7+76,;:A;;B6?@>9;;5<3?A:CA?A?######################### XT:A:U NM:i:4 SM:i:25 AM:i:0 X0:i:1 X1:i:0 XM:i:4 XO:i:0 XG:i:0 MD:Z:2C28T51T3G3 foo_398/1 83 chr3 146074136 60 91M = 146073777 -450 GGACAGATGTGGAAATGGAGAAAACAATTTAAACAACATATTTTCTGGCTGTTGATTTTTTACTTGACCTTATGAGTATTGGTTAATTTGT GGGFGGGGGEGGGGFGFFGGGGGEGFGEEFGFFFGFGGGGFGGGGGGGFGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_398/2 163 chr3 146073777 60 91M = 146074136 450 GTAAGTTCTTTATATGTTTATTTTCTGCCAAGGTAAAAACTAAACCAAGTAAAATTGGATTCTTTATTTTAGAAAATATTTGTAATTCACC GGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGAGGGGEGGGGFGGGGGGGGGDGGFFGGGGGGGFDGGGGGGDGGGGEGGFGEGGGGFFF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_400/1 83 chr2 177102151 60 91M = 177101786 -456 CTTCCCCTACCCCCTCCGTCCTCCTCCCATACCCTCCTGGAATCTAGACCCAGTGATGGAGGATCACCCTAACAATGCAGCGTTCCCTCTC ######??*;FFFDFGGGGEGEGFGGEGFGCGGGGGGGFGGGGG?DGGGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:5A85 foo_400/2 163 chr2 177101786 60 91M = 177102151 456 CATGATAATTGTGTAAAACACAGGGTGCGGTGAACTAGTCATTTTGACTGCAAGAAAAAGTTATATTTAACTTGTTAGAGGTTCAAAGTTT GGGGGGFGGGGGGGGGGGGEFFGGGCEAEEBFFFFGGGFGGDGDDGGGFGEGEGFGGGGGEGGGFGFGFEGGGGGGGFGGGEGGFDFEEFC XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_402/1 99 chr22 25528389 60 91M = 25528784 486 TACTGGGAACCAGGCACATATATTATCTCGTTAATCCTTATATCCACCCTGAAAAGAAGGGGTTACTGTTAGCTCCATTTTACAGATGAAG ?(666*..77645?6B6=2BBAAA:BB@@B40:BB5::57B################################################## XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_402/2 147 chr22 25528784 60 91M = 25528389 -486 GAGTTAGAAGCTATTACCACGCCCATTTTACAGATAAGAAAGCTGAAGCCCCATGAAGAGAAAGGATCCTACTGCTAGTAAGAGAAAAGCT ###########################################A>>>AA:CB=:B=?@@6=737675;>:>=AA>?:;==>?=;?9A?5=: XT:A:U NM:i:2 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:16G24A49 foo_404/1 99 chr3 175457907 60 91M = 175458291 475 TATTAATTGTCCTATTATAAAGGTACATGCACACATATGTTCCTTGCAGCACTATTCGCAAATAGCAAAGACATGAAATCAACCAAAGTGC FEEFGGGFGGGGGDGGBGFFGGGGGEGFFEFFFEDFFFFFGFGGBGGGG=GDGGGGGGGDFGFGGGFGGGDGGGE=AEEEGGFGGDFGEDF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_404/2 147 chr3 175458291 60 91M = 175457907 -475 CATGTGTGATGGTTAATACTGAGTGTCAACTTGATTGGATTGAAGGATACAAAGTATTGATCCTGGGATGATCTGCGCAGCAACTTTATAT 
=5?DCFGGFGAFFBECC=BDDCAEEAEECEDDFAFDEEEAEDDDDDDD?C?DEFFF?FDFBFBFFFFDFDDC?ACADDDD?EED?EEFFBG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_406/1 99 chr1 16990046 15 91M = 16990439 484 CACACAGCTAGCTAGTAAATAAACTTTAAAAAATTGAAGTATTATGTACATAAAGACAGGCATAGTGTACAACTAAGTGGGTTATCATAAA GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGEGGGFGGDFFFFFGGFFGGGFGGFGGGGGGGEFEGEEGGGGGGFGGFGGGGGGGGGGGF XT:A:R NM:i:0 SM:i:0 AM:i:0 X0:i:2 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 XA:Z:chr1,+16990046,91M,0; foo_406/2 147 chr1 16990439 23 91M = 16990046 -484 TTTATTTATACATTTTTCATTTTGCTGGTTATTTGAGTAGATTTTAATTTTGCCCCATGACAAATAATGCTACTTGGAATGTTCTTGGATA GGGGGGDFFEGGGGEGGGGGFGGGGGGGFEGGGGGGGGGGGGGFGGGGFGGGFGGGGGGGGGGGFGGGGGGEGGGGGFGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:23 AM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 XA:Z:chr1,+232993482,91M,1; foo_408/1 83 chr5 104564853 60 91M = 104564470 -474 GAAGAAGTTGAATCTCTGAATAGACCAATAATAGGTTCTGAAATTGAGGCAATAATTAATGGCCTAACAACCAAAAAAAAGTCCAGGACCA EGG:GFGEGGFFFFDFEBEECGEEGEGGFGGFGGGDBFDGGGGGGGGGG?GGGGGDFGGGGEGGGGGFGGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_408/2 163 chr5 104564470 60 91M = 104564853 474 CCACCCTAACATCACAATTAAAAGAACTAGAGAAGCAAGAGCAAACACATTCTAAAGCTGGAAGGCAAGAAATAACTAAGATCAGAGCAGA GGGGGGGGGGGEGGGGFGDGGGEGGGGGGGGGGGGGGGGGGDGGDGGGGGGGGGFGDFGGDGFGAGGFDEFGDGGGGDEGGFDGGGGGFGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_410/1 83 chr2 135252137 60 91M = 135251766 -462 GAAGAGGAAAGGGGGCAAAAGATCAAATGCAAATGCCTTGGCTTGTCCTGGGGGGTGGAAACTTCCAGAAGACGTGTAATGAGCCCACTCT GBDEDGDBE=GGGGGEEDAEEEFGEDAFGEGGE?EGBEBF?EAGGEEGGGGGGGEFEFFFFGGEGFGGGGGFGGFGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_410/2 163 chr2 135251766 60 91M = 135252137 462 CACCCGGCCGATATAATTAAGACTTTTAAGTTGAAGGCTCATTGTTCTCTAGTACTGGTACTTGAATCAAGGAATGCTATTTATCATTACC GGGGGFFGGGGFGGGGGGGGGGGGGGGGDGGGGGGGGGGGGGGFGGDGGGEGGGGGDGFGGGFGFDEFEGBGGGBGGFEEEGGGDGGE=GG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_412/1 99 chr2 85234289 60 91M = 85234672 474 TTAATCTCTACACTGCACTATTCCTAGGACACTAACTTATGTTTAATTTTGGATTGCTAGTTTTCTAGAATACATATTCTTAAGTATGTTG GGGGGGGGGGGGGGGGGGGGGGGGGEGGGGGGGGGGGGFFGGGGFBDGGGGGEFGFFEDE@EFFEEEEEEEEEGEEGGGGGEGGEE=ECEE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_412/2 147 chr2 85234672 60 91M = 85234289 -474 TCGTCCCAGAATTGAGGCTGTACATCCTTTAAAAATAAATAAAGCCGAGGTCTGTGCTAGAGTTAGAACATTGAATCTGGCTCGCTGGTTT AA??@?DGGGFEGEFGDFEFBEGGEGFFDFGGGGGGGGGDEGGGDGGGFGGGGGGGGGGGGGGGGGGGFGGGGDGGGGGGGEGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_414/1 99 chr11 83587338 60 91M = 83587723 476 AATAAATGGTATACCTGAATATGAGTAAAATAAAAATTATAAAGTGGAGTGCTAACCCTTTATCACTATTGGGGAGACAAGGACTTAGAGA GGGFGGGGGGGGGGGGGGGGGGGFGFFGGGGGGGGEGGGFFEFFCFFEFCGGGGGGGFGGGGFGFFGGDGGGFGEGGGEGGGGGGGFGFGE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_414/2 147 chr11 83587723 60 91M = 83587338 -476 GTGCAGTATTATTTGCACGTCATAAAAAGTCTTCCTACCTTATTCTTTCCCAAGTTTACTGAGAAAAGGGAAGTGAAGATAGGCTTGAAAA EEGEGGEEFEEGFEFEEGGGGGGGGGGDGGGGFGGGEGGGGGGEFGGGGGGGGGGGGGGGFGGGGGEGGGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_416/1 99 chrX 65347845 60 91M = 65348206 452 ATTTGTACAAATGTAAGGAGTATATGAGAAATTTTGTTACATGTATATTATGTATAGTGATCAAGTCAGAATATGTACAGTGTCCATCATC FEFDFFFDFF?EECBCCCCCBEEEEEEBEED,DDDECCCEFF=DFE=EEEFFDFEEEBE-@>C=C==@@>EE=EE@;@@>ADBACBD55A7 XT:A:U NM:i:0 SM:i:37 
AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_416/2 147 chrX 65348206 60 91M = 65347845 -452 TTTCCATCCATGTTGCTGCAGATTATATGATTTCATTCTTTTTTATGGCTGAATAGTATTCCATTGTGTATATATACCACATTTTCTTTAT ED5:BC?B:5?DEBEF?BDAD=?@;>@;@==DB=<>BB?BE@@@<>ADDD?DDBD?DA?BDFEBBFEEEDDC?CCAACAC=8;>=:;;9@> XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_418/1 83 chr7 13465890 60 91M = 13465514 -467 GCTGGAGAATCTCCCTACCTTAAAAGAGCTTGGTGTTGGAGAAGCCAAGCTCTGTGGGGCCTTGCTACTGAACACACCAGAGCCAAAAAGA ############AA:>;>=-7CCBAB:C@DDFCCEEAFDECEEEEAEFFFDGDBGGGGGGGGGDGGFFFFFFFFFEFGGGGGGGGGGGGGF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_418/2 163 chr7 13465514 60 91M = 13465890 467 AGGATCGAAATATTGAGAGATTGCCTAGTAAGAAGTAAGGAGAACTCTAAGGAACATAATGAGTGCAGATATAAGAAGACATTTTCCTTAG FDGBGFGDGGDFGGGFGGGGGGFGGGGGDGFDFFBDFEDFEEEEEGFDGFGDFGGDEGBEFGGDGFFF5FF?FDFGGGGGFGFAFEFGBFG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_420/1 83 chr11 98265841 60 91M = 98265465 -467 CATGAAAACAGCAAGGGGGAAGTCCATTCCCATGATTCAGTCATCTCCCACCAGGCTCTTCTTCCCACACTGAGAATTACAATTCAACATG FGFGEGGEGFGFFFGGEDGFEGAFGD?GGFGEGGGFGGDDGGEFGGGGGFGGGGGGGGFGGGDGGGGGGGGGGGFGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_420/2 163 chr11 98265465 60 91M = 98265841 467 ATTTATTTATGAAGATGAAATATCTCTCCATTTATTTATTTGTTTGATATCTTTTTATCAGGGGTTTATAGTTTTCCTCAAATGTTTTTAC GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGDGGGFGGGGGEFGEGDGFGGEGGDGGGGGFGEFGFGGGGGGGBFGDFGGGGFGGFG6E XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_422/1 99 chr14 70232998 60 91M = 70233391 484 TCTTAGTGTTATGAGGTGCTGTGGAGAAACAGAGATGCACAGGATAGACCCCAGCCCTGCGCAGTAAGAATGGGAATAAGAATTACCGCTG EEEECEBEDEEDE::@9>??C>A?A>>B9>BDBDCBAEEEEAEE=D=:B?>@:@>BA:A-1..60??<:1-9;5):,5?7>@@C####### XT:A:U NM:i:2 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:59A26A4 foo_422/2 147 chr14 70233391 60 91M = 70232998 -484 GATTTGCTTGAACCTGGGAGGCAGAGGTTGCAGTGAGCTGAGGTAGTGCCACTGCACTCCAGCCTGGGCAATAGAGTGAGACTCAGTTAAA #################@:69AAA5A?A-A?AA?A??A:B;@CCC??EEEAEE?E5F XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:2A88 foo_424/1 83 chr4 117406443 60 91M = 117406043 -491 TAGTATTACCATGCCCTGTTATTAAGGTCAATGGGAAAATACAGCAGCCCAATCCCGGCAGGACTACAAATGGTCCAGATCCTTCAGGAAT GGGEEFE@GDEFGFDGGFEEEEEGEFEEEEEDEEEGGGGFEGEGGGGGEGFGEGGGFFF5FGFGGGGGFGGGGGGGGGFFGGGGGGGGGFG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_424/2 163 chr4 117406043 60 91M = 117406443 491 CAATATATGGTACTCTTTCTCCCATAGCCAGGATTCCCGGGTCCACGAATCAAGGGGTGGAAGTAGAAGTGACACCACTCACCATCACCCC GGGGGGGGGGEGGGGGGGGGGGGGGGGDGBGGEGGGGGGGGEGGGGFGDGFBGFBEE>EA>DCCCECECBCBBB5DCBAADFED5EEEDEG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:37T53 foo_426/1 83 chr2 108096607 60 91M = 108096235 -463 AGTCCCCTTCTGAGGGATATGTACAGATGGATTTCCCACCTTGCCCGGGATCTTAAGGCTGGCACATGTAAAACTCCTGGGTTTTGGTAGG BGBGAEGGGGGGGGFFEFDEGFEEGGF:FDF=FEFFFCD?FGGGGGGGGEFGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_426/2 163 chr2 108096235 60 91M = 108096607 463 TCTGTGGGTAGAACTCTTGCTGGAGTGGCTGAAGCCCCCACAGGAAAGTCCCACCCAGTGAGGAAGAATGGATTCAGGGGCCCACTTAAAG GGGGGGGGGGFDGFGGFGGGFGGGG?FFFFC@C--DCDD=>)@:>8>@9>?5A>>A-:?################# XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:79T11 foo_428/1 83 chr16 3069959 60 91M = 3069588 -462 TTCCCCATTCCCACATACGGCACAATCAGGAGCTGTGATACACATTCTCCTGGCTCTGCTTTCCAGGGAACAGAAGTAGATATAACAGTTT 
E=DDCCCB=EEECDEECEEDFCCCC=DDGDEEGGGGGGGEGEGGGBGFGGDGFGGFGGGGGDGDGGGGFFFGFGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_428/2 163 chr16 3069588 60 91M = 3069959 462 CTTGTGCTCCCCATTGTTGTAATAAATCTCTTCTCCATAAATTTATAGGCACAGAAGTTATAATTGGTTGAATATTCCCAGGTTGTCCATC GGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGFEFFFFGGGGFFGGGGGGGGGGGGGGGGGGGFGGGEGGGGGEFEFDDFFFEG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_430/1 83 chr17 7727714 60 91M = 7727353 -452 CCCGCTTCTCATTTAAAAGATATGTACACAATTAAATGCGTTGTGACTTCAATGAGGTATGTAGAAACAGAGTTATATATAAACATATATT =GGBFGEDGFGEEEDEEGEEEFFCFDBBEGGGBGGFEEGGGFGGGGGGFGGGDFGFGGGFGGGDGGFDGGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_430/2 163 chr17 7727353 60 91M = 7727714 452 GAGATCATGGAGGGAAGGCAAATATACTGAAACGGAAGAAAAGAGAACATAATTTCACGATCTTCAGATTTGACTCTTTCCTCTTTATCCT GGGFGGGGGGFGGGGGEGGGGGGGGGGGGGGEGGGGFGGGGGGGGGFGGGGGGGGGGGGGGGGFGGGGGGGGGEGBEFEAGFGDGEEBFEF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_432/1 99 chr9 7161074 60 91M = 7161455 472 TTTTTTCCAGTCAGATGGGATCCTGACAGCAGAGTAACTGCATCAGGTGGGAACTGACTTTCCTCTCAGCTGCATCCTTCAGAGGAAGGGC GGGGGGGGGGGGGGGGGGGGDGGGGGGFGEGGGGFGGGGGGGGGGGG@GGFAFFFGEFGGGEGFGGFEFGE=EGEGGEGGGFGFGBEEE## XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:89T1 foo_432/2 147 chr9 7161455 60 91M = 7161074 -472 CTCCCCTCTTCCCTGCCTTATTCCCAGCTCCTCCCGCTCCTTTCTTTTCCTTTTTCAGCACCTAGAGATTTAGAGGATGGGGAGTGGAGTA A-7??6.<.6=EB::BFEEEF:FFFEGG:GFEGGFGFDGGGGBFDGGFGGGFGGFGGGG?GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_434/1 99 chr14 53456345 60 91M = 53456729 475 AGGGGTAAAAAATTATTTTCCATCAAATAATAATCATTCTCCTAAAATCTTAGTCTCGCATTCCTTGAAATTAAATTTTTTCCTAAATTTT FFFEDEGGGGGGGGGGGGGGGAGGGGFEGGGGGGGGGEFDGGEGGDFGGGEFEEGEF:DADECECGGADDGGGFFGGGGGGGGGE=FFFDG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_434/2 147 chr14 53456729 60 91M = 53456345 -475 TTGATCTAACACTCACGTAAAATCTGTTTACTAAGCTCATCCGCAATTAAATAAATACAAATAAAAACAACAGAGAGATACTTTTGCCATC GBEBBEEEEE@GFGFFGFEEGDGBGGGBDFGEGEFGGDGDGGGGDGGGDGGFFGGDFDGEGCGGGGGGGGFGGGFGGGGAGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_436/1 99 chr6 62612893 60 91M = 62613264 462 CTACTTGACAAGATAAACTCTAGATATTTTAATTATTCAGTTTTTACTTAAAATGGTAGATTTCATGACATATTTTGTTTCGGAATTATTG GDGGGGGFGGGEFCFGGFDDFCCFFGGDGDFDEBAEFFEFEGGE4CBBBCFGGGE@:=@?@BB=AED@ADBBBBBEB??>(:154BBCCBB XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_436/2 147 chr6 62613264 60 91M = 62612893 -462 CCCGGTTTCGGGCGATTCCCCTGCCTCAGCCTCCCGAGTAGCTGGGACTACAGGTGCATGCCACTACGCCCAGCTAATTTTTGTATTCTTA ###################?9E=EEDC>=@;AACA?CAC:CAAAB?EADEEEEE:ECAC5CDDDD?DEEEDFEFFFEBFFDEEE?EDFFFF XT:A:U NM:i:3 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:3 XO:i:0 XG:i:0 MD:Z:5G4A7T72 foo_438/1 83 chr1 176775580 60 91M = 176775191 -480 TGTCTTTAAACCTAGGAAATAAGGCCAGGCATGGTGGCTCATGCCTGTAATCCCAGCACTATAGGGGGCTGAGGAGGGTGGATCACTTGAG GGEBGGFBGDGEBGGGB:DFEDEEGGEFEGFGGGGGEGGBFGGGGGGGGGGGGGGGGEGGGGGGGGGFGGGGGGFGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_438/2 163 chr1 176775191 60 91M = 176775580 480 CAGGCTGTGGTGAGGATACCGTGGCCGGGACTGCTGCCACAGGACACGGGATGCTACTCTCACAGATGGTCTCCTGGCCTAGCTGCCATGG GGGFGGGGFG:FFFFGGFFGFDFFDEDEE:EED=CBBEAC=BDBAADDDDB?BE?CCCC=ECEEA=CBC@DC?E:BC5E-@??>=AD?AAE XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:47A43 
foo_440/1 99 chr2 4083031 60 91M = 4083401 461 TAACTCCTGCCTTCAGAAGCAGATACTGAGCCTCAAATCTAAGATTGCCCTGAGTCAGAGTCTTATTTCCTGTAGAGAAAGAGCTGACATT GGGGGGGGGFGGGGGGGGGGGGGGGFFG?GDGGGGGFGFGEGGGGEEFEGFFEGEGGFEEEGEEEEGGEGEECDEGGEEGGEEGEFDDAEE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_440/2 147 chr2 4083401 60 91M = 4083031 -461 TTTTGTCTGAGGAGATAAAACCTGTGCTACCTGAGGCAACAGTGATGGCCACCCCGAGGCAATTGCTAGGCAAGATAATGTTGATTCTCCT AE:EEACBE5EADBCACCC?CD=D5@DCAD?EEFGGGEGBGFEEEDD@@C:GEGGGFGGGGAGFFGEGEFGGGGGGGGGGGFGGGGGGFGG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:19C71 foo_442/1 99 chr3 116679418 60 91M = 116679795 468 GGGGCTAGTCTAGTCATCTTCAAGGAAATGATCATGAACAAAGAGTAAAGGACACAGGCATTTGGGAGTTGTTAGGAGAACTTTGACGTTA FFFFFFF??FFBFDFFDE?FDFDDFDEEAECEDEDF?DCEDDD=D@CEEED:EEEFBDFDFEFEFEBE@BC@C?BCC=?C=ABC::BA>A= XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_442/2 147 chr3 116679795 60 91M = 116679418 -468 GGTGCCTTTATCCAGGTAAAGAACAGTGTATAAAAAAGGGCAACAGATCTCCAACCAGAGTTGAGTGGTTGTGCAGAGACTATATGGCACA DBDEFFFF:EEEEAEBBEBEDF=FF?FGDGDEGEGGFDFDEFFDFFFBDEFGGFGFFF=FF=FFEADGGGGDBFFGDGDGDGGGGGGGFGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_444/1 81 chr4 48956355 15 91M * 0 0 AAGGTGTGAGCCACTGTGCCTGACCGGTGTAACCACTTTGAAAAACAACGTGGCAGTTTCTCAAAGACTAAATGTATAGTAATCACATAAT #######?@@C???@A?@C@A<8><2BABCCEGGCGGDGGGGGGEGGGGGFGGGGGFGGGGGGGGGGGGGGFGGGGEGGGGGGGGGGGGGG XT:A:R NM:i:1 SM:i:0 AM:i:0 X0:i:2 X1:i:1 XM:i:1 XO:i:0 XG:i:0 MD:Z:25A65 XA:Z:chr4,-48956355,91M,1;chr4,+11734865,91M,2; foo_445/1 99 chr4 22642402 60 91M = 22642765 454 TGCACATCTGTTTTTGTTGATTGCATGGAATCTGCTGAAATATTCTTGAAAGTACAACTAGAAATAGGTGTAAAAACCACCTTTCCATTGA GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFFEGGGGGGFGEFGGGGGEGFGGGGGGGGGGGGGGGGGGEG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_445/2 147 chr4 22642765 60 91M = 22642402 -454 AAAGAGAGACTTGAGGAAGAGAGACAAGAAAGCTCTTGGAATGATTGAGAAGAAAATTAAAACGCCCAAAGTAAAGTACAATCTTATGTTA GFGGGGGGGFFGGGGGGGGGGFGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGDGGGFGGGGGFGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_447/1 99 chr3 9479055 60 91M = 9479407 443 AGGTCAAGAGATCAAGACCATCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAAATTAGCCGGGTGTGGTAGCGAGCAC GGGFGGFGGGFGGGGGGGGGGGGGGGGGGGGGFGGEGEGGFGGGGFGGGGGGEGGGGGGGGEEGGE=EDGEA?:?################ XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_447/2 147 chr3 9479407 60 91M = 9479055 -443 ACCCTGTCTCTACTAAAAATACAAAAATTAAGAATGTAGATATATGGGAGCAAGAATACCTGTTTTGTCTCTCTCCACACATCAAGTTTCT AFFF=FFGGEFGGGDGGGGGEGGGGGGFGGGGGGDGGGGGGGGGGGGGGGGGGGGEG=GGFGGGFFGFGGGGGGGFGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_449/1 83 chr18 32443236 60 91M = 32442861 -466 AGAGGCTGCAGGGTGGAACTGGGAGGGGAGGCCCAGGGACAGGTGGAAGTGTGGTAGAATTTGATGAGCCTATTAGAGAATCAGCCAAAAA =FEEEFECFFFEEEDEC@B3=CFCCEEEBEDAEEFFFEFFBFFDFFFFFFFDFFFFFFFFFFFDFFFFFFF=FFFFFFFFBFFDFDFFFFF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_449/2 163 chr18 32442861 60 91M = 32443236 466 ACCTGTTGAAGGAGATTTGTGTGTTTATAGCTTTTGGCTTCCACAGATAGAGCTGCTATGAACATTCATGTACTGGCTTTGCCACAATGGT DGGEGGDGFGGFGFDEEBEEEEEEEFFDFFGGGFDGGFGGFGADEDF?FDBD=E:D?DB:DDE=EEABEBBDA:5==.?>8.?5?###### XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_451/1 99 chr4 75590706 36 91M = 75591067 452 
AGCAATATTTGCTGTTCTGCAGCCTCTGCTGGTGATACCCAGGCAAACAGGGTCTGGAGTGGACCTCCAGCAAACTTCAACAGACCTGCAG GGGGGGGGGGGGGGGGGGGGGGGGGGGGFAGGDGGDGGGGGGGGGGGGGGGGEGEGGEEAEFCDFEGGEFEGGEEGGFDGG?FGEFEEFGE XT:A:R NM:i:0 SM:i:0 AM:i:0 X0:i:2 X1:i:11 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_451/2 147 chr4 75591067 37 91M = 75590706 -452 TAGGCTTCAGAAGGTCAGTAATAACAAACTTCTCTGAGCTAAAGAAGCATGTTCTAAACCAACATAAAGAAGATAAAAACCTTGAAAGAAG GFGAGAFGGGGFFGGGFGAGGGGFGGGEGGEGGGGGFEDGGGGGGGGGGGGGGFGGGGGGGGGGEGGGGGGGGGGGGGFGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_453/2 145 chr10 103682977 37 91M * 0 0 TATTTTTATCACAAGTAATAAAAAATAGGGCAAACATAGAAAAAGACTCTAAGAAAATTTCAGGATAAGTTTAGGAGGGGGTTTAGATGCT ####@7>?>AC;:.84:;A:+;40;A:?-?@@@@==7:=.5ED=CEBEEEDDAEECCC@A??D5DACB=EEEEDAAC:C:BD=DD?=CCA?FEDFEEDEE=EDDEDFEFF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_454/2 163 chr5 38679464 60 91M = 38679842 469 GCACTGTCCATCTGTCCTGTGGCTTCCCAGGAGTTTAAGGAGTTTACTTAAACCCCTTTTCCTGGCTTTGCTCTGTTCTGCTGTTCAGTCT FC@CEEFEF?D?EEEDDEEE==:CCAEE:D;-;85A?A?A5AAAA?BCA:CD-DDEDEEEBD=?D6==C-;;;+;=@<-,;@@<@?5:CC= XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:52A38 foo_456/1 99 chr2 142269256 60 91M = 142269639 474 TAGGATATCTTTCTGTAGGTTAAGCAAGGTGAGTACCATTTGTCCTGTTTTTTATGCTGATCATAATCTCCTAGTCTCCATGCTGTCACAT C?CCCACC;?C?@@B4/*73@3096CCC;A6@6)C8:*:;4'5'56(5=0@0;22>C@?@9+:):?B3=?################# XT:A:U NM:i:4 SM:i:25 AM:i:25 X0:i:1 X1:i:0 XM:i:4 XO:i:0 XG:i:0 MD:Z:17A23A1A2T44 foo_456/2 147 chr2 142269639 60 91M = 142269256 -474 ACTGGGTGGCACATGTACAAATTGGAAAGAACAGAAAATAAGATGTAAGCCAAGGGGATGTCAATTCCAGTCACGGTTCTGATGTTTATTA ###########################BFFFC93@C;3*455@42*A:A178.:0<>9C:@.6B:B:>6><>:@C@AA?5ADDD>BD:DDD XT:A:U NM:i:1 SM:i:37 AM:i:25 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:38G52 foo_458/1 99 chr11 41351080 60 91M = 41351473 484 ATAGAACTTACTCTATTTCTTATTTAAAAATCTAGAGAAAGAATACTTTTCCATTTTTGTACAGCTATGACTATTCTAACACATCCATTTA GGGGFGGGGGGGGGEGGGFGGGGGGGGFGGGGGGGGGGGGGGGFGGGGGGGGFGFGGGGEAFFGGGGGGEEFGGFGGAGEGGGEGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_458/2 147 chr11 41351473 60 91M = 41351080 -484 TATTTCTTAAGATATACGTATACATACTTACACATCTAAATGTAGTACACTGTAAAACTGTCAGTTAAATTTTTAAACTATTAAGAAACAG FGGGEGGGGGGAGGFGGGFGFGGGGFGGGGGGGGGGGGGGGGGGGGGGGFDGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:49T41 foo_460/1 83 chr9 117548613 60 91M = 117548242 -462 GCATTGTGAGATTTCATCCTTGAGAGCATCTAGCAGTGGGAATGATTTAAGATGACAGCACAGCTTACTGTTCTATCCTTCAAATCTTTCC GGGGGEEEFEEDDGBEEEFGEGEFGAGGEFGGG=EGGGFEGGGGGGGGGGEGGGDGFFFDAEBFGGEGGGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_460/2 163 chr9 117548242 60 91M = 117548613 462 AGTTAAGATGTGGAACAAAAGGGAACACATAGATGCTGAAGTTAGGAGACAAAGATGATTTTGAGAACAAAAAAATTCCTAGCAACTAACA GFGGGGGFGFGGGDGGGGFEGFDGGGGFGFGGDGGGFDFGDGGEBGGGFGGFBDGGGGFGGGGBGGGFGGGGBGDDEGGFEGGGDGGGGED XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_462/1 83 chr1 101394399 60 91M = 101394004 -486 CCCTCTTTCACTCTTAACAGTGTCTCGGTTTGGACAATAAATTACATGATCACATTAATTTCTCCCATATTCTATGAGTGAGAATTCAATC FEDEFFDDFAFFGGDGCEEEE=EEEGGGFEFGEGEGGGGGGGEEGGFGGFGEGGGGGGGGFGGGGGGGGGGGFGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_462/2 163 chr1 101394004 60 91M = 101394399 486 GAAAAACTGACATATAAACAGGCATTTAAAAATGTATGAGCATTCAGTTTGCTAGAGTTTAAATTCAAGCTCAAGACTGATGAAAGGAGAG 
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGEGGGGGGGGGGGEGGGGGGGBGGFGGGGGGGGGGBFFGGGGGGGFEGGGFGFGEGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_464/1 83 chr5 60820050 60 91M = 60819675 -466 GGCTGTAAATCCATCTGATCCTGGGCTTTTCTTTGCTGGGAGATTCTTTATTACCGATTCAGTGTTGCCACTCATTACTGGTCTGCTCAGG GFGGGGGGEEFGFFGGGGBGFEGFBDCCECGFFGEGGGEGGGEFGBGEGEGGEFGGGFGGGGGGGGDGGGGGGGGGGGGGGGGGGGFGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_464/2 163 chr5 60819675 60 91M = 60820050 466 CTGAATTTTACAAATGGTTTTTCTGTCTATTGAGATAATCATATGGTTTTTGTTCTTTATTCTGGTATCACATTTCTCAATTTGCATGTTA GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGDGGGGGGGEGFGGGGGGGGGGGGGGGGGGGGFGGGGGGGDGGG?FFFFFGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_466/1 65 chr18 23435820 37 91M * 0 0 TAACATATATAAGTGCGTGTGTGTGAGAGTGTGTGTGTGTGTGTGTGTGTGTATTTTTCAACCTCAGGAAATTCACAACATGCTGCTTTTT GGGGGGGGGGGGGEGGGGGGFFFFFCECEBECECEFFEFFECEEEEFFFFFDEFFGGEEEEDGDEBFDACCDBEBGFFG=AD?DCEGGGEG XT:A:U NM:i:1 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:44A46 foo_467/1 83 chr2 198599324 60 91M = 198598930 -485 GTTTTGGGAAAGTTAGATTTAATGGTTAAGTTTCTTATAATTGTGAATGACTTAGTGAGTTAATTGGAGGGAGAGAGAGAATTAATTGAAC GEEGGGBGGGAGGGEGGEEGEEDEFAEGFEDFEEGFFGGFEFECGEFFF@FGGGGGGGGFGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_467/2 163 chr2 198598930 60 91M = 198599324 485 AGTTAATCATATTTGCAATGGGGCCATTGTTGTCTCTTTATAGGTTTAGTTTTAAGGCTGGCATTAAAGAGTTTTGCAAAATTATTTATTT GGGGGGGGGGGGGGGGFGGGGGDFGFGGGFGBGGGGFGGFFFFBFGGGFGGGGGGGEGFGFDDGFF?EFFGCGFGFFFEEGGGBEBDDFFG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_469/1 83 chr2 214053692 60 91M = 214053296 -487 TTGTACACTTTCCCTTTGAATATTTTTAGGTTTGCAACACTTTTATGTGATTATCATAGTGTTAAACAGAAGTTATAATTTACTGTCCAGG FCDC:CAEFFEDADDCBBEC?GGBGGFGEGGFGBGGEGDFGGGG=FGGGGGGFAGFGGGGGFDGGGGGGGGEEEE?GGDFGFGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_469/2 163 chr2 214053296 60 91M = 214053692 487 GTACTATTAGGCATATCCCCTTAGCCAATAATACAAATAAGTTAGCCTTGCTACAGCAAAGTTTTGCCTTGCCTTTGAGGAATGGAAGCAG EDEE?EEEEBF:EFFGGGGFGDGGGGGFG?GGGGGGDGGGFBDFFAFFFFFG?GGGGGDGFEFFFGGGGFGGGGGGGDGEGGGGABFFEEF XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:32G58 foo_471/1 99 chr4 100638765 60 91M = 100639152 478 AGTACCCACTGTGTCTCTACTCGAATCTCATGATTCTTGAGATGGACAACTCAGCTCCTCAATAGAAATGGGTATGTTTGGTTCCCACACC GGFGGGGGGGGDEGGGGGGGGGGGGGGGFGGGGGGGGGFGGAGGFGFGGGGGFDGCGGGGGGGFAGEGEFCE?CBEEEECE?EBEFFFEDC XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_471/2 147 chr4 100639152 60 91M = 100638765 -478 AGTTTATTAAGAGCTAGTTTTCTACCAGCTTTGAAACGGATTTTCTATTTCTTCATTTGGGCACAAGATCATGAAGTTAACGTCTATTTAT BFDEBBGGEGEGFEGGF?AFFEEEGGEGGGGGGGGGGGGGGGGGGGGFGGGGGDEGFGGGFFD?GGDGGGDFGGGGGFGFGGFDGGGGGDG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_473/1 99 chr1 187310324 60 91M = 187310672 439 GACCTTTTTGGCCTTATTATGCATTTTCTTGCTTAGATTTAATTTAACTTAGATTTCTACTTTGTGCACAATAGCTACTTAGTTCCAGGCA GGGGGGGGGGGFGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGFGGGGEEGGGFGEGDDGGGFFFGDGGGGFGEGEGGGGGA XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_473/2 147 chr1 187310672 60 91M = 187310324 -439 TCCCAAAATTCTAGCTTGTATAGCTGCTAAGGAAGAGGTTCAAAACTTCAGTTTTTATACATGGGTCTGAAATTTAAGAAATAATAACTCA AGEFGGGGGEGGFFEFGGGDGGGFGGGAGGGGGGGGGGGGGGGGGGFGGGGGGGBGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 
foo_475/1 99 chr20 52340463 60 91M = 52340837 465 AGTTATCCTTGAATACTAGCGTGTACTCTTAAGTCTAGTTGCCTGAGTTCAAAGTCAATTGTCTGGCTCTGCCACTTACTTGAAGGGGCTT GGGFGGGGGGGFDFGGGGGFGEFEGFFGGFFGG?GGGGGGGGGFGFFGGGFD?DFEDD=BFEFFFGBBGADBFE?DED=CDDAEECD,A?D XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:87A3 foo_475/2 147 chr20 52340837 60 91M = 52340463 -465 AGATGGCGATGAAAGTGACCTCTGGTCGTCCTCACTGCTCATTATATGCTAATTATAATGCATTAGGATGCTAAAAGGCCCTCCCACCAGC BFGEGEEFGBGGEGDBEBGEEGDGEGEFGGGFGFEGDGEGGGGGGGFEFFEGGFEGFFF:DFGDEGGGGDDGGGGGF>@FFFGFGDFFFFF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_477/1 83 chr10 12108214 60 91M = 12107819 -486 TTAAAAATTGTATTTCACTAGTTTAAGGTGAAGAAGAAGTGGCTTGTCAGCAGTTCATATGAGAAATAAAGCTTTAGTGCAAATAAGTTTA E=FFFFAAAA9DD?AD>A>@CE-CFFGDFDGADEGFBAFDGGGGFGGGF=GGGGGFGAFDGEEGDGFGFDFGGGFGGFGEGGGGGGEGDGG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:74C16 foo_477/2 163 chr10 12107819 60 91M = 12108214 486 AGACAGGGTCTCTCTCTGTCGCCCAGGCTAGAGTGCAGTGGTGCGATCATGGCTTACTGCAGTCTCGACTTTCCAGGCTCAACTGATCCTC DDADD=CC?CFFFFFFFAAFEEEEEGGGCGGFCDGBFBDFECEEEEFBEEEE?DECE:BD?A:688:<.E XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_479/1 99 chr8 21158099 60 91M = 21158474 466 AAATGAGATGCATGTGTTTAATTTCCCATTGTTACCCGCTAGTTCTTGCTTTCATAATTAAAAGAAAATTGCTTATAAGGAAATTTGCCTG GGGGGEGGGGGGGGGGGGGGGGFGGGGEGGGGGFGGFGFGFDEFFEC?CCEECEEFFFGAEEED+7:;3C?EEECFFFEDFFEEFFEGEGF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_479/2 147 chr8 21158474 60 91M = 21158099 -466 CAGACCTGGGACGCTTTTCGTATGCCATTACCAAAATCCTTCATGAGTTGATATTATCTTTAACACACAGAGGGAAAACTAAGTTCTAAGG EEFDBFEGGGEFFDFFDGFGGGGGGGEGGFGGGGGFGGGGGGGGGGG?GGGDGGEGDGGFGFFFFFGGGGGGGGGGGGEBGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_481/1 99 chr6 49209149 60 91M = 49209520 462 TTTGAATAAAACTTTATTTACAAAAAGAGGCAGTGGGCTGGATTTGGCTAAAATCCAAAAATTTGAGTTTTCTTGGATAAAGAAAATGTGG GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGDGGGEFGGEGGGGGGBGFEGBFGEGEGEEFGEEGEEGFEEC8GEFFFFEEGGGEAEEG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_481/2 147 chr6 49209520 60 91M = 49209149 -462 TGAAATAATCTGTACACTAAATCCCCAGGACATGCAATTTACCCATATAACAAACATGCACATGTATCCTCTTGAATGTAAAACAAAATTT 4EGGBDFEDFEGCDFDGDGGDEDGFFGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGDGGFGGGGGGGGGGGGGGGGGGGGGGGGFFGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_483/1 99 chr16 63481174 60 91M = 63481544 461 TAATTGTTTTCGTGCCACTGGTCCTTTGCAAGGCTAAAAATAATGCCCTCCAAATTAAGTGGACACCACAAAGCTTCATTTTCACTGTCCT EEEE=DBDD>7=C?CC:CC?C>C:AAAA?C:BDDCB5DC?EEE=CC>CA5<;:>;7<:C<1&746DDCBDEAEABDDAA:? 
XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_483/2 147 chr16 63481544 60 91M = 63481174 -461 TGCTGAATGAATGAACACCCATCCATTCATTCATTCATGTATCATCTATTTACTCTTTTAATGTGCATTTCTGGAAGATTGTTTTATTGTG :9<=BC?A=?A.2:*61C=CE=C=EED:BBE?ACD@C:::DECE5CDB@BAAC9CCGGEGAFFDBEEEE?EDAAD=EA;EAA@CC=>C@@C XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_485/1 83 chr9 125074494 60 91M = 125074118 -467 ATCTGATACTTCTTTTTTTTTAGACAGAGTTTTGTTCTGTCACCCAGGCTGGAGTGCAGTGGCGCAATCTCGGCTCACTGAAACCTCTGTC ############?@@B>@>>:2825/49(0=:--7-8;<)90BAA@FBB?FGDGFE?EEEEGDEFGFCFEFGDGFGEFFFFGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_485/2 163 chr9 125074118 60 91M = 125074494 467 GAGGCCAAGGTGGGCAGATCACGAGGTCAGGAGATCGAGACCATCCTGACCACCATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATT GGGGGGGGGGGGGGGGGEGGGGGGGGEGGGGFGBGGGGGGGGGGGGGFEGGGGGGGGFFGBFFDGGGGEGEEGGBGGFF=EFCEEDFGF=E XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_487/1 83 chr5 42213812 60 91M = 42213434 -469 AAATGCCACCAGTCACCTTGCTAAAGCACAGCAAGAGTCAACTTTGCTCCAGTTCCCGACAAGTTCCTCGTCTTCCTCAGAGACCATCTCA C76527D:DDDDDABDCCA=C:=ACC@>>@B-C=CCD:DDDDDB-BCCA=CD==?D@CB??==>C?9:=7=BDD:C=77;:CDA?DB<=B5 XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_487/2 163 chr5 42213434 60 91M = 42213812 469 AGGCACCAAGTCCCTAGGCTGTACACAACAAGGGAGCCCTGGCCCTGGCCCAAGAAATCATTTTTCTCTCCTAGGCCTATGGCCCTGTGAT D?E=DD=:D=@6@@@?:ACA:=9>BDDEEEFF?=-;B===?:AACC=?=CC5CCCEBE=EAD?BDDACAEGEFDGBD?A534::80618+A XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_489/1 83 chr8 26167640 60 91M = 26167273 -458 CCAGCACTTTGGGAGGCCAAGGCAGGTGGATCACCTGAGGTCAGGAGTTTGAGAGCAGCCTGGCCAACATGGTGAAAACTTGTCTCCACTA E?B5EB?BEAFFDFFEB=BBE=B=FBBCBB?A?BAACC?CBAAA-AAEE?EDAGGGCCCC:GGFGFGDGGGGEGGGFF=FFGGGGDFFEEF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_489/2 163 chr8 26167273 60 37M4I50M = 26167640 458 TCCAGCCTGGGCAACAAGAGCGAAACTAAGTCTCAAAAAATAAATAAATAAATAAATAAATAAATAAATAAATAAATAAATAAAATAGGTC GEGDGDD?FFEDGGGFEDGGGFAGFEEDEACD5CDEEEE?A>@@>EECAEA@8EE@CFF@=>=BD=D?DDBC:D38?B=?BDCDD?CDCBDDB?CCBBBBABBBBB:-DBABD=CBBDACD:B## XT:A:U NM:i:1 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:1 XG:i:1 MD:Z:35^A56 foo_492/1 99 chr13 96453707 60 91M = 96454068 452 ATTTTGTTATTTTATTACATATTTATTTGAAAAGAAAATATTCTCACTCTGTTCTTTCATGACTTCTTTACATTAACCTGAAATTTTTTCA GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGFGGFGGGGGGGGGGGGGGGFGGGGGGGDGFGGFDFGGFGGGEGGGGGGF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_492/2 147 chr13 96454068 60 91M = 96453707 -452 ACTTAACACAGTTCTTTCCACCATAGTCCTATGCACTTATTTAGACCTTGAGCAGGGCCAGCTATATAATTCACAGGTCTTAGTGTTGAAA DGFEGFGEGGGFFGGFGGGGGGGGGGGGGGGGGGGGFFFFEFFFFFGGGGGGGFGGGGGGFGGGGFGGGFGGGGFGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_494/1 99 chr1 187310324 60 91M = 187310672 439 GACCTTTTTGGCCTTATTATGCATTTTCTTGCTTAGATTTAATTTAACTTAGATTTCTACTTTGTGCACAATAGCTACTTAGTTCCAGGCA GGGGGGGGGGGGGGGGGGDGGGEGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGEEGGGFGEDGEFGFFDFFEGGEGFFGFAGDEEE? 
XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_494/2 147 chr1 187310672 60 91M = 187310324 -439 TCCCAAAATTCTAGCTTGTATAGCTGCTAAGGAAGAGGTTCAAAACTTCAGTTTTTATACATGGGTCTGAAATTTAAGAAATAATAACTCA ?EEBEEGG=FEGGGGGGGGGGGFGGGEGEGGGGGGGGGGFDGGGGGGGGGGGGGGGGGGGGGGGGEGGGGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_496/1 81 chr11 73965428 37 91M * 0 0 GGAGACAGAAAGACCAGTGGGGCGGCTGAAAGATGGCCGGGCCTGTGCTCAGCGAGGGGCAGTAAGGATGGGAGTAGACTTGGCACCTGCT B?=9BB698:.=37::4'/6%3)8332-9>)@==@@<)67:C=2CC@ACA@?A1C=B?;B+?AAC@=?:CCCC5CC66680748<1540=? XT:A:U NM:i:1 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:22A68 foo_497/1 81 chr16 30480960 37 91M * 0 0 AATATCGATCCGGTGTTGCTGACCTAGTTTGTTCCTGCTGGAAAAGTCTTCTCTTGCTGTTCCCCCCAGGCGGCCTTGGCAGCGCCTCTTG @@>=4C;=-B7=;*?;>?;=:5=BC?AC?BCD?BBE<7=975A?CA=-?1:>>C=.@?@@@=C4>;C@?-;?DDBDDA=DEEA=EEEEEDD XT:A:U NM:i:1 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:63T27 foo_498/1 65 chr1 206890991 37 91M * 0 0 CCTTAGGAAAATAGAGAGGGCTTTCTCTTATAAATATATTTTTTAAACAAGTAACATCTAGTAAAAGCAAACCCATCCTTCGCTTGCTATG >0776*65.(04+:0;<9));:989;(=?3;/:.=);B;7BD8>8BBC?B0>AA-6.>):7)962A?AA??0?@=@5C@@>55>?=C@>@D XT:A:U NM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:58C2A29 foo_499/1 83 chr17 38911344 60 91M = 38910967 -468 TACTGGGGAGAAATGGAGATCCTGGAGGAGCAGTTTCTTCTGTGTTAGTGGCAAGCCTGTGGCTTAAGACAGAGCTTTGGCCCTCGGCTCA ######E4CAEC>=A@>?:=@?9BA@BAA:ABBCCC?CBABBAACBAEBE??GGGGGFGGGGGGGGGGGGGGGGGGGGDGGGGGGGGFFGG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:4T86
================================================ FILE: examples/pydoop_script/data/base_histogram_input/example_2.sam ================================================
foo_499/2 163 chr17 38910967 60 91M = 38911344 468 TATTGAACCAGGCAGGGGAACCTGGGCCCCTGAACTCTGTCTCTTTATACTGCATTTTGAAAGCAGCACTTGGCTCTCTAATTGCCCCATA GGGGGGGGGGGGGGGGGGEGGGEGDG?GGGBGFGGGFGGEGGFFGGFGDGGGGGDGDGGFFGGEEGGGGEC5EEEGGGEFEADDEEFGBEF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_501/1 83 chr14 44191907 60 91M = 44191517 -481 CAATCAAGAAATTAGAAATATATGTTTGAATTATTTACCCACTACTTAGTAGATTTTTGAACAAAACCTTTTTCTTTTCAATTTTTAATTG EFGEBGGGGEGGFGGGEGGGGGEGEEBEGGFEGFGGEFGFDGDEDGGGGGGGGGFGFGGGGDGGGGGGGFGFGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_501/2 163 chr14 44191517 60 91M = 44191907 481 CTGACCACTGAGTCAGGAAACACAAATACAGAATAAATACTTTATTGGCAACACATTATAATATACATTAAAATAATAGTATCTGTATTTC GGGEGFGGGFGGGGGGEGGFDDEEGGFFGGBFFFFGFGAGGGGGFEGFGGFGEGGGGGGGGGGGGGGGGEFGGGGGGFEFGEG=GEFGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_503/1 83 chr10 28470051 60 91M = 28469666 -476 TTATGCTAACAACTAAAAAAGTTTCTATGATCACTTTGATAAGTACCATAAAGTCACTTGACAAAATTTAACATAATTCTGATTTATTTTT GGGGAFGFGGGCEEBEE:FEFGFGFGEFGGEGEGFGGFGGGGGGEGGGGFGGGGGEGGGFGFFGGGGGGGGGGGGGGGGGGGGGFGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_503/2 163 chr10 28469666 60 91M = 28470051 476 AAAAATAAATAATGAAGTAATTCTGGAAGAGCTTGAAAGCTGGATTAAATCAATAACTACAGAAGAAATTAAAAGCACTATCAAAAAACTT GGGFGGGGGGGGGGGGGGFFGGGGGGGGGGGGGGGGGGFGFGFGGGGGEGGGEGGGGGDGGGGGGGGGGGGGGGGFGGGGBGFGGGFFEFG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_505/1 99 chr6 52691824 60 91M = 52692215 482 GTGGAAAATAAAGCTTTGTAAAATAATCGGAGATAGCTAGATAAGCAAACAAGTTAGTTTATTTAAACTGTATTGAAAAAAAATTAAGCAA GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGFFGGGGFFEGEGFGEGFFFEEGFEGGEGFFFFGGGE;GGGBG5F XT:A:U NM:i:0
SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_505/2 147 chr6 52692215 60 91M = 52691824 -482 GTGAGTTTAAGGGGGATGCTCTTGGAGTATATTTAAATTGTTCTGTTAAAATGTTTCGATTGACATTTACCTTCTCTGTGAGCTGTATCTG GGGCGGGGBFGGGGGGGGGGGGGFGGGGGGGGGGGEGGGGGGGGGGGEGFGGGGGGGGGGGGGFGGGGFGGGFGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_507/1 99 chr3 142788081 60 91M = 142788459 469 ACTACTATTCATGTCATTTAAAGTTAAAGGATACTTCTTTGTTTTGGATTAACTTTTAATTTTTATAGCTAAATGTTTACATCTGTTATGT GGGGGGGGFGGFGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGDGGFGGFGGGGEGDGGGEFEGGGDDGFGGGGGAAFFDFFGGGFG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_507/2 147 chr3 142788459 60 91M = 142788081 -469 TTATGTTTTTCTCACATTAAATTTCTTGAGTTTCTGAAAATGGTGTTCCTTCAGTGTGCTCTGTTTTCTAAAAATCCATAGTAATCCATAC GAGGGFGEGEGGGEGGGGGGGFGGGGGGGGGGGGDGGFGGGGGEGGGGGGGGGGGGGGDDGGGGGGGFGGGGFG=GGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_509/1 99 chr6 3270997 60 91M = 3271365 459 GAAATCTATATTAGCCACGGTAAATTCAGTGTTGGAAGCTCAGCATGGGCTAGTATCAGATAAGGAAATTCAGCACTTGGCCATCCAGGTA GGGGGGGGGGGGGGGGGGGGEGGGGGGGGFGGGGGGFGGGGGGFEFGGGEFFFEFEEBECE@FEEDEEDDECCC?CBC@CEFEEDEAEECC XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_509/2 147 chr6 3271365 60 91M = 3270997 -459 AAAATGCTCCCTTGCTGGACCAGTCTGGCTTCAGTCTCCTCTTGCTACAAGTGAGTTGCTGTGGCTGGCACATCTCTTAGTGTGACTGTCA GFEEDEFBGGFEGGFGF?EFGFGEGFGGGGGFGGGFGGGEGGGGGGGGGGGGGGGEGFGGGDGGGGGGGGGGGGGGGGGGGGGGGGFGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_511/1 99 chr3 142226380 60 91M = 142226747 458 GAAACTTGTTTTGTGGCTCATTATATAGTCTATGTTGCTGAACATACCTTATGGGCTTGGAAAAATAATATGTATTTTTTCTTATTACAGA GGGGGGGGGGGGGGGGGGGGGGEGDGGGEFGGGFFGGGGFFGGFEFGGFGEFFGGBBC=C=CCCCA;??;<>EEE@A;?C?CE:BCEC XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_511/2 147 chr3 142226747 60 91M = 142226380 -458 GCTTTCTTAATCTTAGTGTTTACCTGCTGTATTTTTAAAAATCTATTTTCTTTCAGTCGCTCTGGATTTTTATTTTTAAAGTGCATCTCTT DFGEBGGFGGFGGFGFGFGFFEGGFGGGGGGFGGGFGGGGGFGGGGGGGGGFGGGGFGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_513/1 99 chr4 108147713 60 91M = 108148076 454 TTATAGAAAGTTTACTTTTTTAGTTTAGTGATTTTTTTCCTCCAGTGGGAGATCTTGAATTTATAAAGCATAGGTGATTTGTTTGTGATGA GGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGFGGGGEGGGEGEBGGGGDFFGGGGEECEFEECEE>EBEFEGEGGEEBFFFE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_513/2 147 chr4 108148076 60 91M = 108147713 -454 TTTTAAAATAAATATATTTTGAATAAAAACACTACAAAATAAGAAATGATTTGAAAAGAAAGCAAATATTTTGGGTGTGATGGTGCTACAG GAGGGGGGGGGGGGGGFGGFGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_515/1 83 chr8 1835390 60 91M = 1835014 -467 GATGAATGAGCTTCCTCGTGTGAGGGAGTCTGGCGCTTGGGAGGAAGGGGGTCAGGCCGCCTGTTAGACATTAGTTATTCAGCCACAGCTG DABADC5AC<6AA->CCCGFGAGAFBEFE:EEAEDFADE-DAEF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_517/1 99 chr1 121186790 54 91M = 121186832 133 CATAAACTCCTTTGTGATGTGTGCGTTCATCTCACAGAGTTTAACCTTTCTTTTCATAGAGCAGTTAGGAAACACTTTGTTTGTAAAGTCT FGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGFGFGGGGGGGGGGGGGGGGGGGBGGGGEFFFFCGEEDDGGFGGGGGGGGEFEGD XT:A:U NM:i:3 SM:i:37 AM:i:17 X0:i:1 X1:i:0 XM:i:3 XO:i:0 XG:i:0 MD:Z:2G26A46C14 foo_517/2 147 chr1 121186832 54 91M = 121186790 -133 AACTTTTCTTTTCATAGAGCAGTTAGGAAACACTCTGTTTGTAAAATCTGCAAGTGGATATTCAGACCTCTTTGAGGCCTTCGTTGGAAAC 
CE>4>@EDDFGEGFFGGDGFGGGFEGFFFEFCEB=EGGGGGGGGGBGGGGGGGGGGGGGGGGGGGBGGGGGGGGGGGGGFGGGGGGGGGGG XT:A:U NM:i:3 SM:i:17 AM:i:17 X0:i:1 X1:i:4 XM:i:3 XO:i:0 XG:i:0 MD:Z:3C41G24C20 foo_519/1 99 chr9 84058926 60 91M = 84059308 473 AAAAATTATTTTCATGTGTTAATCATATCTTTCAGGCAAGATAACTTAGAAATTATTCATCCTTTATAACTACTAGCAAACTTAGTGTGGT GGGGGGGGGGGGGGGGFGGGGGFGGGGGGGGGGGGGGGGGDDGFFGGGGGGGGGDGFGGGEEGGGGGGDGGGGGGEFGGGGFEGGDGGGGE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_519/2 147 chr9 84059308 60 91M = 84058926 -473 ATTACTACAGTTGTTATCTTTATAATTAAAGTAGACTGACTACAGCATGAATGAATTAGGTTAGTTATAATGAAGATGAAAATATTTTTAT GGGEFGGGGGGFGGGDGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_521/1 99 chr1 206206389 60 91M = 206206755 457 GGACAGGCAAGAAGACTATACTATTCCTGCCTCCTATTTGCTCCCTCTTGCCAGTGGGGAAATCAGTGGCAACCTTAGAGTAATCTTGGAG GGGGGGGGGGGGGGFGGGGGGFGEGGGGGFGGGGGFGGEGGGGGGFGGFFFF?FCEEEECEE:EEEEEEECCBCBDBB?CA=CACBCBB5E XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_521/2 147 chr1 206206755 60 91M = 206206389 -457 TAAATATATAATTGGAGGAAGAAATGTGTCAGGAAATAACTTCTTTACTTGTGGAAAGTCATTGAAAACCCTTGGTTTTGCGTATGAGCAC DGFGFFGGFGFGGGGGGGGGGGGGGGGGFGGGGGGGGFGGFGGGFGFGGGGGGGGGGGGBGGGGGGGEGGGGGGGGGGGGGGGGGGGGGFG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:32A58 foo_523/1 99 chr9 91731195 60 91M = 91731593 489 TAAGGGGCTGATTCTGGCACCCAGCCCAGCTGTCCTCATAAGCCAGGTGGTCTGCATGCTTGTCCTAGATAGTTCTAGACTGACAAAGAGC FFFFDDGGG?GEDFGDDFDGFGEFEGGFGD?DGEDEFGGDDGFGFBE:=EBEEEDFFDEBGGDBGGBFEGFB:FF?CCDDFF=AFFFEEF# XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_523/2 147 chr9 91731593 60 91M = 91731195 -489 CGCCCCAGCCTGTGGAATGCTGTCCAACCTCTTTCACTTAGGAGAGACAGGACAAGCTGTAAGCGTTCATATGCCAGCCTGGAGTATGGGG 5>5A???-DCC??A>A44;;;0*547(==1;FEEFDEEEEDBFDFDF?BFFD?DGBEDFGGDGGGGADFFF=?AEGGFEGGGFGDGGGGFG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:2G88 foo_525/1 83 chr2 105643826 60 91M = 105643428 -489 TTTAAACTTACATTTCCCCTTTCCATGGACTTCCTGTCCATTCATGCATCTTCTTTGGTGAAGTATCTGTTCAAATCTTTTGCTCATTTTT EBA?>-AAA-?:>;7?';?::@BB@BE?A=EB=EBBE:FEFCEA?DCBEA>FFGFGEEAEEF=FDFAFFDF?FEDGEGGGGEFGDDGGGGG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:16A74 foo_525/2 163 chr2 105643428 60 91M = 105643826 489 GTCATCGTTGTTATTGCATGAATCACTAATTCATTCCTCAATATTGCTGAGTGTGAGTGCACAACAGCTTGCTTGTCCATTCGTCATCACT EEE:EFFFFFGAGGGAFDEFDFFFFFGGEGGBGGGFEDGGFBDFFGDDGDD5D?DDD@DADDAEEDBGGEGCGG?EDD:CEBEBC:C:CDD XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_527/1 99 chr1 23303761 60 91M = 23304138 468 TACAGTGACAGGGGCAAGGTAGGAACCTTATAAATGTTTTGTAAATAAATTAATGTAATAGTTTTATTGTATTCTGTACTGGCCAAATTTA GGGGGEGGGGDGGGFFFFFAFFFDFGGFGGGGGGGGEGGGGEGGFGGGGGGGFGGEGGGGGEGGEFGEGEGGEGEGFGGGEBGGFGDEGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_527/2 147 chr1 23304138 60 91M = 23303761 -468 CAACCTGTATTGGACAAAGGGATTAAGTAAGAGAGTAACACAATCAGAACTGCAGTTTTTGAAAGGCCATTCTAGCAGGATAAACTAGGGT GFEFBDGFGGGEFFFDGGGFGDFGFGGFFGGGGGGGGGGGDGGGGFGGGGFGGGGFGGFGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_529/1 99 chr11 130722616 60 91M = 130722997 472 TGTAAGTGGTAAGTACCATGATGGGAAGAAGCCGGGGATCTGTAGAGATGCAAGGGCAGTATGTCTGGCTCAGCCTAAGGAAAGCAGGGCA GFDGGFFFFBFFFFFGGGGGDDGGFGGGGGGGFGGGGGGGGFAGDFAFFFEGGFGGGFGBFGGEFFFF-?B?==@:;B?A6>@>>?5BC## XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 
XG:i:0 MD:Z:91 foo_529/2 147 chr11 130722997 60 91M = 130722616 -472 TGTTAGAAGTGCACTGGGATCCTAGTAAGTCACTGAGAAGTGAGATGCTCCTTTTGAAGACTTCATCTTTAACCAGTGCTCAATATTTTGG GFEBDDG?FGG?EGFFAFAEGGGGGDGEGGGEGDGGGFFGGGGGGGGGGFGFGGEFGEGDGGGGGGGGGGGEGGGGGGGDGGGGGGEGGGF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_531/2 145 chr7 145234 37 91M * 0 0 CCATGACGACACCGGGAGTTCCCGCTCCTTTCCATGGCAATGACTCAATGTCCCAAAAGTTACTATGCCTTCCTTAGAAATTTCTGCATAA ################B::;(D=CDD?C:CC::C=AB.?B=5>50+;;A<(:BB7:968872)355:73?:97:62?=665/4:;: XT:A:U NM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:20A29A40 foo_532/1 99 chr5 27195861 60 91M = 27196237 467 GCCAACGAAATCCTGAACAAAAAGAACAAAGCTGGAAGTTTCACATTACCTGACTTCAAATTATGCTACAAAATCACAGTTAAAAAAAATA GGGGGGGGGGGGGGGGGGGGGGGGEGFGGGGGGGGGGGEGGFGGGGGGGGGGGGGGGGDGEGGFGFFEGGGGGGGGGGGGGGGGGGGEGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_532/2 147 chr5 27196237 60 91M = 27195861 -467 CACTCAAAGACATTGTTCATGGCAAAGATTTCTTGAGTAGAAGCTCAAAAGCACAGGCAACAAAAGCAAAAATGAACAAATGCAATCTCAA ?=CAC=GEGFGGGGFEGGG:GGFGEGBG=GGGEFGFGFGGGGGGEGFFFFFGBFFGG@GGDGGGGFGGGGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_534/1 83 chr11 66280042 60 91M = 66279654 -479 TTAAAACTATCTTTTGGGGGACATAGTTCAACCCACAACAGACTAGCAAATTTGATATCTGAGATGATTCATCTATCAGAAGCCAAGAGAT C>CB?5AB??CDECBEEEEBECBBB?CAADABECBDGDFFFGGDGGAGGGGFGGGGGGGGGGFFDFFGEGEDFEDFEGGGGGGGGFGGEGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_534/2 163 chr11 66279654 60 91M = 66280042 479 AAATCTCTGCTCTCTAGGGCTGCTGTAACAAAGTACCACAAACTGGATGGCTTGAAACAAAATATATTTATTTTCTCACAGTTCTGGAGGC GGGFEGGGGGGGGGGGGGGGGGFGDFGGGGGGGGGGG=GGGGGGGGGFGGEDGDFFFFDDGGEGEGDGGFDGGGGGBGE5CBECCAD=ADC XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_536/1 83 chr12 30789694 60 91M = 30789324 -461 AGAGGCTGCAAGAGCCATATAGAACATTACAAGTTTTATTTACATTATTATAAGTACCTGTTGGGTTAAAGAAGAAAATCAAACTAAAATT GFFFEFC?GGFGGGGGGGFGGFFEFFBDGFGGGGEGFGGDGGFGFGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGFGGGGFGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_536/2 163 chr12 30789324 60 91M = 30789694 461 AGATGTATTAAATGTAACAAGGGATCACAGTTCAGCAGCAAAAACAGGTACCACATTCTAATTCTTTAACAACAAAAACTTTGGTAGAACC GGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGDGGGGGGGGGGGGGGGGGGGGGGGFGFGGGFGGGG?GGGGGGGGDGGGGGGGGGD XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_538/1 99 chr18 14052702 60 91M = 14053081 470 CAACAATGGTGGTGCTCATGCTAAAGTCTAAAAAATACTTGTTTCACACACATTCCCCACACAGACAAACACCCACTGAACACATAAAACA DBDDDDD-BD*7)77DDDDDDDBBDC5C?CC?CCC>@A=?@=6=B9@@>C85?9)>@6>>B:?DDE@@EE4DFEDA=?DC?AD-D:ECBC==B8=624(5-=CACD=DAD XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_540/1 83 chr5 16736316 60 91M = 16735955 -452 TCTTACTGAGGGCAGAAGACAATTTTGATTCTTTCTCTGGTCCGTCCTAGAGAAACTTTTCTTAAATTCTGCTGCAGGCCAGAAGTAGAAC EEEDAFGEEGGGFFGGEEFFGGEGDEFGFEGGGFGBGFGGEGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_540/2 163 chr5 16735955 60 91M = 16736316 452 CAGTTGAGGAAGAGAGAAAACCCTGAATGAGGATGCAAAATTCTAAGGATGGATCATCACATGCCCAGCCTGGAGGAGTGAAGGGCTTGGG GGGFGGGGGGGGFGGGFGGGGGGGGGGGGGGFEFGGGGGGFGGEGGFGGGGGGGGEEGGGGFGFGGGGGGDFGEGEAEAEABCCC=BBBCG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_542/1 99 chr16 1641294 60 91M = 1641669 466 CACACACAGGTTATGGAGAGGTCGACTGAGAGGTCACACAGAGGTCATCTAGGGGTGGATTTAGGTCACGCACAGGTCACGGAAAGGTGGA 
GGGGGGGGGDDGFGDFFAFFFBFF?EEEEEDEAAFGGFGEE=EEBFBFEFDEAED=@>-=BC?=CAEABE?BBBBE@ECEBDBDA@B?BB4 XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_542/2 147 chr16 1641669 60 91M = 1641294 -466 AGAGGTGGACTGAGAGGTCACGGAGAGGTGGACTGAGGTCACACAGAGGTCACCTAGAGGTGGATTGAGGTCACACAGAGGTCATAGAGAG 4?C=@5FFGGADGGGGGG=EGGFGGGFDEGFFGGGGFFEGGGGGFGEEFEEEEGGGGGGGFGGFFGGGGGEGGGGGGGGGFGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_544/1 83 chr7 61606857 60 91M = 61606477 -471 TTGCTTTAGAGAGAGCAGATTTGAAACACTCTTGCTGTGGCATTTTCAGGTGGAGATTTCAAACGATTTGAGGACAATTGCAGAAAAGGAA DGFFGFEFGGGGGGGGEGEEEDEFECFAFEFEDEFGGFGGGGGFEGGGGFGGGGGGFGGGGGFGGGGGGGFGGGGGGGGGGGGGGFGFFGG XT:A:U NM:i:1 SM:i:37 AM:i:23 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:62G28 foo_544/2 163 chr7 61606477 60 91M = 61606857 471 TCTTTGTGCTGTGTGTATTCAACTCACAGAGTGGAACGTCCCTTTGCACAGAGCAGATTTGAAACACTCTTTTTGTGGAGTTTGCAAGTGG FGGFGGGGGDFBFFFDGGGGGGGGFBGGGFFBFFFFFFFFFFGGGGGGGGGGFEGGDBGFGGEFGGDGDGGGGGABBEA>ACABBADDD=FFCGFGFGFFGDGBGFDGGGGEEGFGGGGGGGGGGGFGFBDFFGGGGDEEEEEFFGGGGGGGGDGFGGGFGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_549/2 163 chr18 67459370 60 91M = 67459745 466 ATTCTCACTTTGGGCTGCGGGTTTCAGGCTTTTTGCCTTGAAGGTAGGGTTTTGCCTGAGTTCCATTCCTATCTGCCTAGAGTTTCTCTGT GGGGGGGGGEEGBGGGGGGGGFG?GGGGEGGGDGGGGFGABBDD=DDDDDDEFFFGGGFDGGDGGFGDGGGGGEFBFDFF:ECDFF?F?FG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_551/1 99 chr17 15311771 60 91M = 15312143 463 CGGTCACCAGCAATGCAGTCCTCAAACACGGACATTAGAGTGATTATTCCCACCCCCAAGAGTGAACACCTACTATGTGCTGGACCCTAGA GGGGGGGGGGFAFFFFFFFFGGGGGGGGGGGGGGGGGGGGEGGGGGGGGGGEGGGGGGGGFGEGEFGDFGEFGFFGGGGGEBGEGGGEFF? XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_551/2 147 chr17 15312143 60 91M = 15311771 -463 GCCTTCTTCTGAAATAAAATCCTCTTTAGATAAGCTGGAGAGAGAATGGCTGTCACTAACAGGCTGTTGAGAATTAGCCTCCTTCTCCCCA CECC:CB=CC:GGEEFFFD?FEFEGGEGFFFE?EF=FFBFEFGGGFGFBGGGGGEGFGGGEGGFGDGEGEGGGGGGGGGCGGGFGBGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_553/1 99 chr21 46880953 60 91M = 46881316 454 GCCTTAAGTCGACACCTGATCTAACAGAAACTAACAGGCTTCAGCAAATGAACAAGATTAACACCTTGCAGACTAGTGAAGAAAACACACT GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGEGFGGGGGGGEGFGGGGGEGFDGEFDFFBEGCCFEFFD XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_553/2 147 chr21 46881316 60 91M = 46880953 -454 AGAAGTGAATCGCAGGTAATTTCCGTTCCACTTCCTAAAAATCTGTGTTTAATGAATGTGAGCATTACAGAGAGCTAAGCGTCAGCTCACA BGGGGGGGGEGGGGGGGGGGGGGFGGGGGGGGGGFGGGGGGFGGGGGGEGGGGGGGGGGGGGEGGGGFDGGGGGGDGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_555/1 83 chr21 19675384 60 91M = 19675012 -463 ATGATTTTAAAGCAAATAAAAAGGTGGACATATTATCCTGGAGGAAAAGAAACAGTACTTGATGTGAGCTATTTTATTGTTGCCATGTTCT F?FBGEDBGGGECEACCGFGEEDBFFCCEBEFEBF?FFDFFGAGGGGGFGGGGDGGEGGGGGDGEGGGGGGGGGGGGGGGFGGGGGGGFGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_555/2 163 chr21 19675012 60 91M = 19675384 463 ACCATCTGATGCTTTTATGTGACATATATTTATGTTACACATTAAAAGCTTTGTATATTAACATATCTTCTTATCTGACGTAAGTTACAGT GGDGFFFFFDGFGGGGEFFFEGGDBFDGGGGDGGGGGGGFFFEGGGGGGGGGGGDGGGFDGFGGFG=DGGGGFGDE?EEFDFDFCF=DEFE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_557/1 83 chr4 98523834 60 91M = 98523445 -480 CCTGCACCAAGGGCTTCATGGGGAGTTCCCTATGATCAGTTGACAGAGGAAGAGAAGACTAGGGCCTGGTTCACAGATGGTTCCACATGAT B@>@C=:EEBFEFFF?=EDED@DDEE@BB?A5?DCC::E:EAF?GFFFF=DEAFEFFFABFFDEGGGGGGFGFGGGG:EGGGGGGGGGEGG 
XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_557/2 163 chr4 98523445 60 91M = 98523834 480 ATGCGGCTTGGAGCCTTTGACAGGCCCCCATAGGTGAAACACAATGGAGGGCTCTAGGGTCTTGGGACAAGGCCCTGCCATCTTCTGCAGA GFFGFDGGG?GGGEGFFFDEECEDDGGGGE=@5D@B>:4>?@CAADD-BDC;CA@AACC-<7=3B########################## XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_559/1 83 chr17 27857630 60 91M = 27857610 -111 AGTAGAAGGGAGGGTATAAAACAAATCTGATCCATCAGATAGAAGGCAGGAAAGGGGAACAAAAGTAGCAAAGAAGAAACACGGGAACAAG @81CCA@>CCCC-C?A8@?7@;?<:3-2:2AC5CB?CACBC:@;:97>3@C@B>;>5A@;@;A?1;6CCCCB=@=@?-C@>=>=:BCC?CC XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_559/2 163 chr17 27857610 60 91M = 27857630 111 TGATGTACAACTCTCAGAGCAGTAGAAGGGAGGGTATAAAACAAATCTGATCCATCAGATAGAAGGCAGGAAAGGGGAACAAAAGTAGCAA DEBD:EDDBEFFBFE==9;BDDDDCFFF=55AA39:>@=?@.@:A2?C-=EEA5A@@6>6C=:CC:DABD:A5A@################ XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_561/1 83 chr7 109887272 60 91M = 109886912 -451 ATTTTGGTGTGTTTATTGGCCATTTGGTTTTTTGTGAATTGATCACATTTCTAGTGGTTTGCTTCTCTTGTCTGCCTTGTCTTTATGTTGT CC=CC=:AFDFDDDDD>@6B@5A=EBACCAFEE5B:FFDFFFDEEFFDFFFFEFEFFFFFFFDDFFFFFFDFDFDFEEEBEEEEDDDEEDE XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:19T71 foo_561/2 163 chr7 109886912 60 91M = 109887272 451 TAATTCATTTGTTCATTTAAATAACTAAAATAAATCATGGTTTGGATATTTTATAATTTATTTAGGAATTGCCCTAGCACTGAGCAACATA GG?EEGAGGEGGFFGGDGGDFGGDGF=FFDBFC@CD5:D5?ACCC=D=DBC;ACAEEDD?EE8ACD-DD?@#################### XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:66C24 foo_563/1 99 chr16 76387682 60 91M = 76388052 461 AAGGGAATGGGGACTGATTATTGCACCCACCCTACCATCCATCTGCAGCATCTTTGCAGTGACAAAGCCTACTATGTACCTCAGCTTTTCA GGGGGGGGGGGGFGGGGGGDGEGFGEGGGGGFCGGFGGGGEEG?BGDGGGE=GGFFEBFDEA?:=CA?CACBBB:D?BDBFFFE=CFBFC: XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_563/2 147 chr16 76388052 60 91M = 76387682 -461 GGCACTCAATAAAGGTTGTTTCTTCCTAAATGACTTGGGCTTTTACATACAGAACTAGGACATGAATTGCCATCTCTTGTTTTCTTGCCTA #@>>@>6CEEEDD-BA;CEBE5EDE?>C@=;D?DDBB5EEEEEEADED:EEEEDEE:DAEF=FFFAB?GGGDGGGGFDDGGFDGGGGGGDD XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_565/1 99 chr13 34433741 60 91M = 34434104 454 ATGTAAAGCTAGGCACTTCGGATACAATGATCAAGATAAGGTCTCTGCCCTGGAGTTTATAGAGTTGTAGAGGGGGACAAATATGGGCAAT EEECEDFFDFGGGGFGGFGGFEEGFGGFBDF?FFFGGEGFBEEE?FEBFDEEE@CBEBBD2>>A5<=73;07=1'.*;30,7<=D XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:74T16 foo_565/2 147 chr13 34434104 60 91M = 34433741 -454 ATCTCAATAATTTCATAATATTTACTACATTTTGAAATGATATTTTGGATATTTGGGTATATGTACTGAGTTAATACAAGTTAATTTCACC 24::2>5ADD?EAE=EC?=CCAEEEACEEAEE=EE?BGGGGFFDFFD=ADDDBA:EDD?ADE:EEE:EAEEEFEEFEEE:DFFFFFEEEEE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_567/1 99 chr10 24568194 60 91M = 24568578 475 TCCGTTCCAGACAAAAACTTCCAGAAGAATTAGATTTTCTGCTTTAACTGCCGCCTCACCTGTGGAGCTCTGGGTACAGGTAAATATGCGG FFFFFGFGGFGGGFGGGGGGDFGGGEGDGGGGG?GGGGGGAGGGGDGGGFGEDGBGEFGGGGBGG5B@BBBBBECB@:DB@B?BAEEE?EF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_567/2 147 chr10 24568578 60 91M = 24568194 -475 ATGCTCCATGTATTTCTTATTTATTCCCAGATGTGTGTGTGTGTGTGTGTGTGTTTGTGTGTGTGTGTGTGTGTGTTCTTTTACTCTTACG ###########################C=5DD:EDE-DFFEEEDEDDDB5BE4AAEFDFDFA?EEEB5:DEF=F:EGFFGDFFFDDGBGGD XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_569/2 145 chr11 112652768 37 91M * 0 0 
GGGGGGGGAGAACGGAAAAGCCAGCCCTTTGTATCGAAATTTTGCTTTTTTTTCCCTCATTCTACTTTAGAACTGCAAGCTTGTGCACTGT ########################B=-;?;;?=A-CCBBAD-CA<@14:=-=6CC?AA-AA71,7.,27-,---?AA:A?A=7772+.13: XT:A:U NM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:12G21A56 foo_570/1 99 chr11 125184448 60 91M = 125184829 472 ATACTGTATTTTTGCAGTCCACTTACATTCAACTTTGGTATCCTTATACTGTGTCTTATGTAAACAGTATATGCTGTTTTTAAAGTTTTAG EGFEFGGFGDFGGGFGGGDDFFFFFEDEFFGEBGGFGGFFGGEDGD:EEEEBEBDDFFFFBEBABED@CCB@B5BFFFFFGG=G=ABDDDD XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_570/2 147 chr11 125184829 60 91M = 125184448 -472 AGGACCCCAAAATATCTTAACTCCATTTATTTTTCCAAGTAATATTATATTATTGTCATGTATTTTAATTTTATATACATTTCAAACCCCA BDAADADFF?FEEEEF?EDBEE5DB>ECFFFD?AFEEEEEBDEABEEEFEFGGGGGEE=EEDGEFGDDGFGDAFDFEE?EEFFFFFDEEEE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_572/1 99 chr2 126452154 60 91M = 126452553 490 TACCTAAAGTCAACAATATGCAATAGTTGTGTCTAGATTAAAGTCTGGGCCATGCAAACATAAAAATCACATAGAGAGTGGCATTATAGGC GGGGGGGGGGGGGGGGGFFGGGGGGGGGEEGGGFFGDGDEGGGEGGEGGGGFBEFGGDDEEEFDFG=BFGF=FEEGGDCEGGGGGE:=BEG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_572/2 147 chr2 126452553 60 91M = 126452154 -490 CATAGCACTTATGGTCTCATCAAGGAAATAAACATATACATACATTAGTGTGCAGAAGGGTCTTTATAGTGTTTTTTAGGAGAAACTGTGA DGFAGGGGGGGFFEDDBDGGAGGGGGEDFFDFFFFDGBGFEFFFDFEGGGFGFFGFFGGGGGFGGGFGGGGGGGGEEGGBGGGGGFGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_574/1 83 chr12 57128540 60 91M = 57128155 -476 TTTTTCTCCTTTTGCCCTTTCAGTATAGAGGTTTCTGAAGAATCTGATCCTGAGACCCTCGTTCTAATAGGTCAGTTCCAGACGACCAAAT AD@D?EE=EDE?BBBB@E@@DDEDDDCCCD@BBBBEFEBEFBBEECB??CDDBD7CEEECEDFFFFDFDFFEEDEDEDBEEFBFDFFFFDF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_574/2 163 chr12 57128155 60 91M = 57128540 476 AATGAGTATATAATAAGCATTAAACAATTAGCACAGTTATTGGTCCACAGCAAGTACCCACTAAGTGGTGGTTACTCTTACTATCATCATC DDDD?B@DDBDCDDDEEE=EDDCD:A9C?CC5?B?CC@@@;>B'@6<4GE=GGG>D8GGGDGGDFFGGFEDGFGFBGGGEFGEFFFFEEFEFFGFFGFEGFGGFGGDFGGGDGGFGFFFEFFF XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:78T12 foo_578/2 129 chr2 86499980 37 91M * 0 0 AGGATATAAATAAATTTTTTAAAAAAGGTAAGAAATATTCAGCTTTACTTGGAATCTAAGAAAATCAGTTTTAAAACCAACCAGGTACAAA DDD5DDAEEEGEFCG?EEA?EFEEDDD:5DDDDD?DEEA:DBCCBFFDFFAE5EAF:FFFGEECGFGFGGA?DFFGG?FBDDDDDACD### XT:A:U NM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_579/1 99 chr2 225371035 60 91M = 225371427 483 GTGCTAGAAATAAAGGATTGAACAAGGAGACACAGTTCATGTTTACTGTGAAACTTACATAGCAGTGATTGAAAAAAACATGACAAATGGA A?BBAEDEEEAE?EE5CCCCEEDEEBEB:EDDBDA=>C>>=CA;CDA:CC=?C@=C:C:?AA<5B?C5=BBACB=;B:=>CAA5:EA6EEE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_579/2 147 chr2 225371427 60 91M = 225371035 -483 TAAAATTTGTGAGATTTGAGTGACATTTTCATGGCCCAATAAGAATCAATATAGTCATTACTTTCTCTACCACCATGCTGACATTTCATCT EAAAA->>@@:8.=B:;898;:+:8B8;9;;@6@@;;>@;;=:?>:9A-?5?5707@;@6?BB>B4A5B;CCB?5ADDDDE;;;;;;5;>; XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_581/2 129 chr10 36765904 37 91M * 0 0 AGCTTTGTAAAGGTGGATATAAAGTCAGTCTTGGCAGGGCACTGTGGCTTATTCCTGTAATCCCAGCACTTTGGGAGGCTGAGGCGGGGGG EGGFGGGGFGDDD@BDDDDDD?DEAD?DDD:DBDCEEAE-C-A>A=B4:=.@?###################################### XT:A:U NM:i:1 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:88T2 foo_582/1 99 chr2 205362768 60 91M = 205363149 472 AAGAACCTGTGACCTCCTCAAGTCCTTTACATAAATTCTCTAACTTGGTGAAGGTGAATTTTTAGTTTTCTTCTTCTTTTTTTTTTTTTTT 
GGGGGGGGGGGGGGFGGGGGDGEGGGGGGGDGGGEGGGGGFFGGGGGGEGCDEECEC=E:ECCCC=CBBB?CCEFFDGDGGGGGGFEGGEG XT:A:U NM:i:3 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:3 XO:i:0 XG:i:0 MD:Z:75T13A0A0 foo_582/2 147 chr2 205363149 60 91M = 205362768 -472 TATAAAATAGTTCATGGAACAATTTAGACCTCTTGAGTTGCATTTTTCTACCAGCTCTTTATCTCTGGTTGTTTGTTTTGCAGTTAAAAAC B?=EDFADGEFEE?FAFEEFFAF?DFF=FFEGGFGGEDE5E;CEEEDFD@EGGGFGEDEB=EG?GFAGGGGGGGGGEFGGGDGEGGFFFBF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_584/1 99 chr6 123435813 60 91M = 123436181 459 CATCATTAACTAAATTTCTGTAAATTTAGAAAACATCATATCAAATAGCTATGAAGAAATGTGTCATATAGAGACATATGAATTTGGTGCT GGGGGGGGGFGFGGGGGGGFFGGGGFGGGGGGFGGEGDGGFFAFFFFFFF?GGGGGFDGGFEFFFDDEEEGBGGFDFFFFGGFGGGDEGDG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_584/2 147 chr6 123436181 60 91M = 123435813 -459 GTGTACTGGGGGTATGGGAATAAACAAAACAATGTCTTTGCCCTTCTGAAAGTCACACTCTAGTGGAGACAACATATAAAAAGCCAGTAAG FEC-EEEGGGGCGDDGGGGGDGGCGFGGGDEGGGGGGGGGDGGGFGFGFGGGGEGFFGCGGGGDGGFGFGGFFEFGFGGGGGGGGFGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_586/1 99 chr2 99374000 60 91M = 99374385 476 ACTAACATGTTTTGTATGTTTTATATATTACATACTGCATTCTTACAGTAAAACAAGCTAGAGAAAAGAAAACGTGATTAAGAAAATCATA GGGGGGGGGGGGGGFFGGGGFFGGGGGGGGGGGGGGGG?GGFGEGFDGGDGGGGGEGGGEFEFEFEAFECA?AE?F=EEEGGDGGGAGGBG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_586/2 147 chr2 99374385 60 91M = 99374000 -476 GGAGAATCAGAACGCTGCTGGCACCCAGAAGCTTCCTGTGTGCTTCTTCCTGATCAACTCCCCTGCCTTCCCCTCCTACCCCCTGACCTTT >?+@A?.0282,77<5??8A=C:CAEBFCF?FFDFFFDD@GDGGBGFAGGGGCCCBE?FFEFGGGEGFGGGGGGGGGGDGGGFGGGGGGGFGFGGGGGGGGGBGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_590/2 163 chr6 158851161 60 91M = 158851552 482 ACCCATTGAATGTATATCCTGAGAAAAATTGGGGCCAAAGAAGCAGGAAAATCTCAAAGCTCTAATGGCAGCCTAAATCCAAGAATTTCAC GGEEGGGGGDGGGGDGFDGGFGGFGFBEBFF=GEGDGG?DGFGBEFFBFFFFFFFDDGFEGAAGDF=F5EC#################### XT:A:U NM:i:2 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:72A6A11 foo_592/1 81 chr18 3888272 37 91M * 0 0 TTGAGATGCTTGACCAGCAATATGTTCCAGAAAATATTAGAAAAATGGATATGTCTCTGTGGTAAACCCTGTCCTAGGCATCTTATATGTA #################BD?BCB5?D<974;>4<;@:2226>=B5=C=C?C==DDD@C@>?CC?CC59=@9?;;<5;9;+;;A<;;;-<>; XT:A:U NM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_593/1 83 chr2 153249797 60 91M = 153249400 -488 CTGAGGCAGGAGAATCACCTGAACCCGGGAGGCAGAGGTTGCAGTGAGCCAAGATTGCACCATTGCACTCCAGCTTGGGTAACGAGCAAAA ?5E:EED=FFFBEBCE:?@BB??CB:==>>;:>?5>7<:=C@=@E=:EEFEEBFEGEGEGGDFGFEFGFFGGGFEGGGGGFGGFGGFGGGGGGGGGGGG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:79G11 foo_595/2 163 chr6 20295183 60 91M = 20295563 471 AATGTTTATTGCAAAGTTAAACTTATTGGGATAAAAATAACCTTTTATGTTTATTCCTCTAAAACATCTTTGACTACCATGACTTGCTCGT FGGEGGGFGGGGAFGGGGFEGFGGDGFFGGDFEGGEGEGGGGGGFGGGDGGGFGDGGGFGBFDFFGGFDGEDEGEFGGBFGEF==:DDBCA XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_597/1 83 chr18 2385287 60 91M = 2384901 -477 GTCCCTGGCTATTGTCTATCAAAGACTCCTCTTATAGCAGCAGCTGTAAACCGGCAGATGTGGAGCAAAATAAATCCAATTCAGATTTTCT ######@,:BBEEEAE????@CBB8ADCCCECCEE?DBEEFFGGEFGGFEGGGEFGGGFGGDGGGGGGGGFGGGGGGGGGEGGGGGGGGGG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:24C66 foo_597/2 163 chr18 2384901 60 91M = 2385287 477 CCCTGCTAAGACTGCATGGTCTCGGGTTATTTTTAGTGGTCTGCTTCCTTCTTTAACTTAATTTTGTGGTGATGTGAATAGAAAGGTTATA GFGEGFGFFGBGDFGDFFFFGGGGGFEFDFGFGGFGEGGGGFFGFGGGGFAGFGAGGGF??GGGGGG?FEFDFFEFEEF=ECEEED?A@:; XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 
[SAM alignment data: paired-end 91 bp reads named foo_N/1 and foo_N/2, each mapped with a 91M CIGAR to human chromosomes (chr1-chr22, chrX, chrY), carrying BWA-style optional tags (XT, NM, SM, AM, X0, X1, XM, XO, XG, MD, XA). The original file is one tab-separated record per line; record boundaries, several mate records, and parts of some quality strings were lost in extraction and are not recoverable here.]
XG:i:0 MD:Z:4A25T6G42T10 foo_885/1 65 chr4 24040080 37 91M * 0 0 CAAAGATGAGTTGGTCTCCTGCGTCACCCCCTACAAGGCCTAATGAGGGAGACAAAGGCATATTAGCAGGTGAATCACATTACAAAAAGGC GGFGGGGGGGFFGGFGAGGFEFGGGFGGGGGGGGG5EEEEFGEGFGFEFEEADF=FFDDDAAE?ECEECE;B0-=??AA?DEEABEDAAAC::B######################### XT:A:U NM:i:3 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:3 XO:i:0 XG:i:0 MD:Z:69T15G0T4 foo_890/1 83 chr5 58979821 60 91M = 58979447 -465 GGAACAAAGTGCTCAGGAGATATATTCACTGACCAATTCCCCAAGTGCTCATGTGTGAGCAATTCAAAAATTCTAAAAATACTATTCAGTG EGGEEGEGGEGGEBEGEEFFEFEBEEGEFEGCFEFBGEFGGGGGGGGGEGGFGFGGGGGGGGGGFGGGGGGGGGGGGGGGEGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_890/2 163 chr5 58979447 60 91M = 58979821 465 TTATATATCTTTGCAGGAAAAGGTATATGTAAAATGAGGTAGAACTAAATCAGTGATTCTTAACCCAGAATCACTACACTGTAATAGAAAT GGGGGGGGGGGGGGGGGGGGGFGEFFFFFFFFGDGGGFGEGGGGFGGGGGFGFGGGGGGGGGGAFFFEFFAFFGGFBFGFGFFFCEFFFFG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_892/1 65 chr4 187482106 37 91M * 0 0 AGTGATTACTTCTGCCTCAGTTAGGAAAGAATTAACAGGAGTAATTATAGTTGAGCCAGATAGGAGTTTTCCAGGAGGAGCACAACATCAA GGGGGGGGGGGGGGFGFGGGEGGGGGGGGGGGGGGGFGGBFCFFFFEFFFEGGEEGGFF@FGFGEE=ACEFFBDFEFFDFFGGGF=FFFFF XT:A:U NM:i:1 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:47C43 foo_893/1 83 chr4 166802072 60 91M = 166801716 -447 CCAATTACATTTTTCCTTAACATGTACTAGCATACTTACTGAAAAGATATTTTTCTTATACTATAACCTGTTAAGCATATAAATAATTTGT #A:DDCACA>>E?5EEEEEBBACA5;BCEDEB>=B=@5ACCD=DD?CB@@BDF=DFDEEEE?FDFDDFD=FBFFFFFDDFDFFF?E?FFFE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_893/2 163 chr4 166801716 60 91M = 166802072 447 GTTACCAAAACTTGAAGAACGGGAAATGCCAACATTTTGCAGGTTTTACTATGATTGCATCTATTCAGCATTAGAAAAATCCTAAATTTCA DDBGFFEAE?FFGFGG?FGGEDFD?EFFFFGGGDFGGG?A?DEEEDEEEBD5CADFFGDDFGFGGGGEGFAEDAADDB@2EEDECB?A?DD XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_895/1 83 chr1 192349686 60 91M = 192349312 -465 AAATGCTTTCCTTCAACAGGCCTGCTGTGTTTGGTTAAAATATCTTAAACATTTTATGTATTGTTAATTTTTTTACACTTTGAAAGCTTCC GGGGDGGDGGGDGFGEGFBGGGDGGGGGGGGGGGGGGGGGFGGFGGGFGGFDGFGGGGFGGGGGGGGFGGGGGGGGGGGEGGGGFGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_895/2 163 chr1 192349312 60 91M = 192349686 465 AAAATTCTCCTTCTGCGTGCTTTTAGCTGTGATTTTTGAAGCAGCTTATAAAAATAAAAATAAAAAGTCACAAATATTTTCGATAAGCTGT FGGGGGDGGGGFGGDGGGFGGGGGGGGGGGFFGGFGFGDFGGGGGGFGFGGGGGGGFGAGEGAGGGGEGGGGEDGGGGGGFGGG??EEEC# XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_897/2 145 chr13 46459041 37 91M * 0 0 AATTCACATACATTGTTTTTCCTTCTTTGTGAGGTTATTTTGTCAATTAAATGATTTCTTAGTGCCGTATGAGTTATAATAGGGTGGGTAG GEEEEADDDABDGGGFDGFGGGGFGEFDGGFGGGGEGGFGGGFGGGGGGGFGGGAGFGDGFGGGGGGGGGFGGFGGFGGGGGGGGFFGGGG XT:A:U NM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_898/1 99 chr6 150659863 60 91M = 150660223 451 TTTGCAAAGATTATGACAGTAAGAAAAGTCTAGGTGGCTGACTCCATCTTGCTTCTAGCCTCACAGGCTAGCTGTCCCTGCTCATTCCTGG GGGGGGGGGGGGGGGGGGGEGGGEFGGGBGFFFFCGGGGGGGFGGADGEEGAEEFEEGGGGGGEECFEDD6:5:77?############## XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:77T13 foo_898/2 147 chr6 150660223 60 91M = 150659863 -451 TGTGGAGCCTAAGATTGGTCTTTTGAAATGTTTTTCAGACTTTTGCATTCTGGTGATCAACAGACTCCACCCAGACCCATGACTCATGACT ?E5DBEGGBGEFFF=FDFFF?GGGGGFD=GFFFF?FEE=E?FFGGGEGGGEGGAGGFFFFAEFCEAFFDFFGGGEFGGEGGGGGGGGGGFG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_900/1 83 chr18 27108044 60 91M = 27107673 -462 
ACAAAGTTCACAGATATAAAATTTTGAAAAAATTTTGAGCCAGTAAAATTTCTTCCAGAAGGCAAAATTCCATGAGGTTTTGTTTTTAATA EGGGFFGEFEGGGGGGDGGGEEGGGDGGGEGGGGGFGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_900/2 163 chr18 27107673 60 91M = 27108044 462 AAGGCAAATTAGTATTACATTTGCACCATATATTCCAAGAAATTCAAAATTGCTCTAAATTATAAACACAGTATTTCTTTCAAATAAGTGG GGGGGGGGGGGGGEGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGEGGGGGGGGGGGGGGGGGGGGFGGGEGFGFGGGGDGGGFFFAFG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_902/1 83 chr18 1844801 29 91M = 1844371 -521 AAAGGGGAGTTTATACACACACATACATATCCACATATACATATACACGTATATATATAAAGGGGAGTATATATATATACATATATATATA <>C=0>=92;:4=?=B041593>00:510:(+)89:=389@3@/589>?+B+?;==:7=1269>?:5C?CC?B=:>==?ACD=?D:CAC5C XT:A:U NM:i:2 SM:i:0 AM:i:0 X0:i:1 X1:i:1 XM:i:2 XO:i:0 XG:i:0 MD:Z:30A17A42 XA:Z:chr18,-1844801,91M,2; foo_902/2 163 chr18 1844371 37 91M = 1844801 521 ATCCAAATAGGTGCTGGATATTATGCTGGAGAAGGCAACTGGGCATGCTAATGTATGACTTTTACTAGTCTGAGCCACGGACAGTCAGCAC --;9-:;7996>6=>>;=66682,7>@;;6B==:.;9;:?:-555@@>@=;5?DA=>?################################# XT:A:U NM:i:1 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:84G6 foo_904/1 83 chr1 147465983 60 91M = 147465581 -493 GAGAGGCCTCAGTTAAGTCTATAGCTGTTGTCACCTGTTGAATCATCTCTAGTCTTCAGAATACCATGAAATTAGTTTTCTCAGAAGTAAA ;@@CCB??:?=?B?DBB:56*DA?EEBBCCCE9@@>DEEEB=B?=5*5)371)640?A??## XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_906/1 99 chr2 107216341 60 91M = 107216698 448 AGAAAAAAAATCACATGAGAATTAAAGAATATCTTAAGACAAGTGAAAACAAAAACACAGCAAATCAAAACTTACAGAAGCAAGCAAAAGC GGGGGGGGEGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFFGFGFGGGGGGGDGGGGFGGGGGG:FEEBEEEEEFFGGGEGD?AFFE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_906/2 147 chr2 107216698 60 91M = 107216341 -448 TAAAAGAGAAGACTCAAGTAAGTACAATAAAAAATGAAAGAGAGGACATTATAACTGATACCACATAAATAGATAAGACTATAAGAGAATA #FFFCFGBGGGDCEGGFGGGGGGGGGGGFGGGGGGGGDGGGGGGGGGGFFGEGFGGFFFEFGBFGGGGGGGGGGGGGGGGGGGGGGGFGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_908/1 83 chr2 6850828 60 91M = 6850442 -477 TGGATCACATATTTTTAAAGAATGGACTCCTTTTTCATTTTCCACATGGCAAGTATGTTATCCTCAGTAACAGACTAAGGTCTTTGCATGA #######A37A@C==C??>>>=,>?;8:667+>:@>?>A@=5DBDCAB?:EDDDADCBD=DEEBED?AAAC5EFCFEEEEADEADDECEEE XT:A:U NM:i:2 SM:i:37 AM:i:25 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:4G26A59 foo_908/2 163 chr2 6850442 60 91M = 6850828 477 TCCCCATAAGATTTGTCCGAATAATTGTGGGAGATGAAAACAAGGATATACAGGGGAGGCCGCCATCAGTCGTCAGTGCTGTGCCAAGTCT DFEDAFE:?DFE?DEE;DDAEEDDEEB=BD@6@>@EE=BEDB>>?6C??########################################## XT:A:U NM:i:4 SM:i:25 AM:i:25 X0:i:1 X1:i:0 XM:i:4 XO:i:0 XG:i:0 MD:Z:53T7T7C16T4 foo_910/1 83 chr9 35126274 60 91M = 35125895 -470 TTAGGGCTATGACATGAACCCTAAAATTCCTGTTCCCTGAAGGTGGAAACCAAAAGAAAGTATCGCCACGTGGTTAAAAGTTCAAGCTCCC FCDEEACCEBB>@@@?C8CCCBEEEFE@BBECF?EEEGEFEEGEEEGGDGGEGGGGFGGGGCEGGGGFGGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_910/2 163 chr9 35125895 60 91M = 35126274 470 CGGGTGTAATGGCACAGGCCTGTAACCCCAGCTACTCAGGGTACTGAGGCAGTAGAATCACTTGAACCCAGGAGGCAGAGGATGCAGGGAG FFFFDGGDGGGFEAGGFGFGGGGGFCGGGEDFEFGFG=GGFAFFEFDFFFGG?FGFC?F=BEEDEEFDG:BBA@B=ABAAA*8C;A=?5CEGGBGEDGEBFFF?DDAEDED=EEE@C;@>C@C6@CDEA=FDFDFGGEGGFDFEDEEDEDD=DDACCCCCBD5BD XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_917/1 99 chr5 32777132 60 91M = 32777504 463 
TTAGCCAGGCATGGTGGCATGCACTTGTAGTCCCAGCTACTTGAGAAGCTGAGGTAGGAGGATCACCTGAACCCAGGGAGCTCAAGGCTGC GGGGGGFGGGGGGGFGGFGGGEFGGGGFGGGGGGFGGGGGFGGFGAGGGGFFFFDEDEBEF?=FFBEE:?FFFFB=EEBEBCE5ECE5CE# XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_917/2 147 chr5 32777504 60 91M = 32777132 -463 TTAAAACTGACTTTATAAATAAATAAAAATTAGAATTTTAAGTAGAAGTTAGAGCTAACTATAATAACTTATTTTAGTTCTGAAGATGTGT GFFAF=EFFGGGGFGGGGGFGEFGGGGGFAGGGGGFGGGGGGDGGGGGGGGGGFGGGFGGFEGGGGGGGGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_919/1 83 chr6 105189052 60 91M = 105188677 -466 TTCAGAAAAGTAATCAAGGAGAGACGCAACACAATGACGAACTGGGTTTAGATTTGCGTGGCTAAGATTCCTAATTTTAAATAATTCCATT AEEAEGGAGFBFGGEGDGAFEFGEGFGFDGDFFFFF=FFFAGGGFGGGFGGGGGFGFGGGGGGGGGGGGGGGGGFGGFGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_919/2 163 chr6 105188677 60 91M = 105189052 466 TTCACTGTGTCATCCAGCTGGAGTGCAATGGCACAATCACAGCTCACTGCAGCCTCGAACTCCCAGGCTCAAGTGATCCTCCCACTTCAGC GGGGGGGGGGGDGGGGGGDEFFFDFFFFFFGGGGGFGGGGGGGGGGFFGGGGGGGGGFGGGGGGFFGGGGE:FEFF:FEFEEED=BAEECG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_921/1 99 chr7 66655187 60 91M = 66655576 480 ATGTGTGATTTTCATGATTACTTAGAAAGAGGTGGATTTATCCAAAGAGAGCGAATGGACCCGTTACAGTCAATATTTCTTGCTTCCTTTA GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFFEFFFFFEFGGGGGGGGGGGGGGGGGGFFGGEAGBCCECC:ECCFFFEBEAEFEDGGGGC XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:22G68 foo_921/2 147 chr7 66655576 60 91M = 66655187 -480 GTGCAAAAGTAACTGAAGTTTTTTTGCTGTTACTTTTAATGGCAAAAACAGCAATTACTTTTGCACCAATTTAATAAAAGGATCCAAGATC F?EGGGEDFFFD?:FFD>GGGEGGEGGDGFFEGGFGGGGGGFEEDFGGGFGGGGGGEFFFFGFFDGGGGGFGGGGGGFGGGFGGGGGGGFG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_923/1 99 chr11 51224961 60 91M = 51225337 467 TTGCATTTCTGACATTGGCCTCAAACAGCTCCCAATGTCCATTCTCAGAATGGACAAAAACAGTGTTTCCAAACTGTTGAATCAAAAGAAA GFEGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGEDGGGGGGGGGGGGEGGGGGGGGEGGEEEEFCAFEEDECEEGEGGGGFGFGFEGEE= XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_923/2 147 chr11 51225337 60 91M = 51224961 -467 AATGTCCATTCACAGAATGGAAAAAAACATTGTTTCCAAGCTGCTGAATCAAAAGTAAGGTTTAACTCTGTGAGATGAATGCACACATCAC EEAEAEEEBEEFEEGGGGEEGGFGGF=FFFFGGGEG?GGGGGFGGGGGAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGEGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_925/1 99 chr8 24087137 60 91M = 24087502 456 TGAAGTCCTGGGGGAGATGTTATGAGGAGAGGAACCTGGGCAGGAGGATTTGTGCTCTCTAGCACAGTGTCACCAGGACTGGGACTGAGAT GGGGFGFGGGFGFGGE?FFFGEGGGFFEFFGFGGGGEGGGDDGFEFFAFFGFEGGGFGEAGGGFGFGEDDCCBBBEEDEBGFFFFGGGEDF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_925/2 147 chr8 24087502 60 91M = 24087137 -456 GTATTCCTATTAAAGGATCCTGTAAAATTAGTGCAGAAGGTAGGAATGACCCACATATGAAGCTCTGGTCTTGGTCCACAGATGCCTTGAT DFFABFD?EABDDD?:?FFDBGDDEGEGBDDEEBEDFDBBFFGGGGEEEDCGEGGGFDGGDGGEGGGGGEGGFGBGGGGGGFGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_927/2 129 chrX 9092306 37 91M * 0 0 CCACAATGTGATACCACCATACACCCACCAGATGACAAAAATTGAAGACCGACAATACCAAATGACGTTGAGGAATTGGGGCAACTCGAAT GGFGGFGGGGGGGGGFGFGGGGGGGGGGGGGGDGGGGGDGGGGEGFGGFGGGGG?GFGGGEFFGGGGFFGAFFBDFE5FFGGFGGD5E:B= XT:A:U NM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_928/1 81 chr4 90683007 37 91M * 0 0 GGGGTTGGTCCCTAGGGATACAAGCTCTAGGCCTGCCCTCATAGGCCTAATCAACAAGTCTGTCCCAGTGGATTCAGGTTCCAGACTCAAC #CA=@C:3%(7,40A:CACA:AAC5C?=>@@=?C?AA?FFFFEEEEEAEFEFFDFFA XT:A:U NM:i:1 SM:i:37 AM:i:0 X0:i:1 
X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:0T90 foo_929/1 99 chr6 136121859 60 91M = 136122250 482 CTTTCTATATCTTTTTGCCAAATCTATATCTGTCAAAATCTGACTTTAGGCATATACCATTAACATAAATTGATATTTTTGGACAAAGCTT GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGFGGFGGGEBGBGGGGGDGGGEFGGGGEFEGGGFDGFDGFFDDEDDGAEGE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_929/2 147 chr6 136122250 60 91M = 136121859 -482 CTTTTCCATTAAAACAGCACCCTACTTCAGTGCCTAGTATATGCTAGGTGTTATTGGGGTTACCTTCAAATTTGCTCATTTGGTCTTCATA FDABAFF?FEFDAAD?EEED=FCDFAGGGGGDGGGGGGGAGFBFFDFFFDFAGGGGGGEEGCGGGGGDGGGDGGGFBGGFGGFGFGGBGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_931/1 83 chr2 91684006 9 91M = 91683626 -471 CAGTTTTGAAACTCTCTTTTTGTAAAATCTGCAAGAGGATATTTGGATAGCTTTGAGGATTTCGTTGGAAACGGGATTGTCTTCATATAAA #####################A5?AA5=-??=B??B>;?5?0231'96?*13254264.@?;3-649:;:8;;7:8DBAADDDB=DDDD=D XT:A:R NM:i:2 SM:i:0 AM:i:0 X0:i:6 X1:i:5 XM:i:2 XO:i:0 XG:i:0 MD:Z:12A50A27 foo_931/2 163 chr2 91683626 9 91M = 91684006 471 TATCTACTCAGCTAACAGAGTTGAACCTTTCTTTTGAGAGAGCAGTTTTGAAACACTGTTTTTGTGGAATCTGCAAGTGGATATTTGTCTA CC>@@AB:>>CCAC=:896A?==9;C=CC?EEE>>44=*;;22@6DDDB:AA+AA?;A:9@>7?-EEDE:FFFDDFGGGEE?AE?GFFGCEFDDGFGBBGGGGDGGGGGFGGGGGFGGDGGGDGGGGFGGEGGGFGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_937/2 129 chr9 34190929 37 91M * 0 0 TGGAGTTTCGCTCTTGTTGCCCAGGCTGGAGTGTGATGGCGTGATCTTGGCTCACCGCAATCTCCGCCTCCCGGGTTCAAGTGATTCTCCT GFEGGGGGGGGGGGGGGDGGGGGGGG?GGGEBEBEEEDEEFDF=FEEDEEGAGFDFGDE?GGDBGGGGGFEEADE4:A?BFBD?EEBDC== XT:A:U NM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_938/1 83 chr14 98494687 60 91M = 98494297 -481 AGCTTGGTAAGTCAGAACGGTCAAAGGCCAGGACTGTGTGCGTCCACACTTGACCTGCCCACACTGCCCCTGAACACCCTTCAGCCCCGGA :5@A==@B=CB:CABB66;)4;:>5>>>?->B@=EEC@->C?@DD?EBEEEEDGGFEEEDBE=FEFFFFAEFGGGEGGGFDGFGGEDGBGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_938/2 163 chr14 98494297 60 91M = 98494687 481 CACATTAGAGTCATTAAAAATCATTAGGATGGTTGAGTTCTGAGGTCAGATAAGCAGGATAAACACACTCCGGAGAAACTAGTTCTCCCTT FGGEGGGAGFGDGGGFFGDDGFEFGFGFFBBDBDDEEBEBFFDFBFDDFFAEEFFFCEAGEB=DBDDEBB?=CAC?CBBDEA:DE:BA?BF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_940/2 145 chr4 25985810 37 91M * 0 0 CTCAGGAGTTTGAGACTAGCCTGGGCAATATGGTGAAACCCTGTTTCTGCAAAAAAATACAAAAATTAGGTGGCCATGGTGGCTCGCACCT GD?FFFDDFBGGDGGGGG?GGFGGGGGGGGFFGGGGGGGGGGGGGGGEGEGGGGGGGCCGGGGGGGDGGGGGGGGGGGGGGGGGGGGEGGG XT:A:U NM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_941/1 99 chr9 24609070 60 91M = 24609419 440 GGTGATTCTCAGCGGAATGGACGGGGAGCTGCAAAGGGGATGGAGTGGGAAGATGACCTTCGCCTGGCGTTCACCAATCCTGCAGCCAATC FFBFFFEFEDFEEBFFBFFBDBDDEEB?EDABA-A:=BB:.?>.7*BB==AA=5:>>?9=?@BD5:70A,?=>?>1:B=BCA:@AD5:?D> XT:A:U NM:i:2 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:67A7C15 foo_941/2 147 chr9 24609419 60 91M = 24609070 -440 CTCTCTTCTGCCCAGTATTTTTTCCTGCCTCCTGTCTGTATCAGTGTAAACAATTTTTTTTTTAAGTTACAAATAATTTTTTATAAAAAAT 1?AA?B?AD=D5FG:GFFEFFEADBEC>C;=CACA:E?EE:C?C:CA;A>:F@5EEGGGBGEFFFFDDBDDFD?EEGGGGECEBFBFEFBF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_943/1 99 chr1 121185968 60 91M = 121186347 470 AACTGACAGAGTTGAACTTTCATTTAGAGAGAGCAGATTTGAAACACTGTTTTTGTGGAATTTGCAAGTGGAGATTTCAAGCGCTTTGGGG GGGGGGGGGGGGGGGGGGGGGGGGGFGEGFGGGGFGGGGGGGGGGGGGGGGGGFGGGGEBEFFFFADGFGFFFDFFGGGFDGFFD=DDEEG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:58T32 foo_943/2 147 chr1 121186347 60 91M = 121185968 -470 
TGAAACACTCTATTTGTGCAATTTGCAAGTGTAGATTTCAAGCGCTTTAAGGTCAACGGCAGAAAAGGAAATATCTTCGTTTCAAAACTAG EBEFEGCEE?E??EEGEEEGGBFDGGGGFFGDGGGFGGGEGGGGGGGGGGEFEFGGGGGFGGGGGGGGGGGGGGGGGGGGGGFGGGGGFGG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:56T34 foo_945/1 83 chr4 184591595 37 91M = 184591221 -465 GCGGCCCAAGCCTCCCCAACGAGCACCGCCCCCTGCTCCACGGTGCCCAGTCCCATCAACAGCCCAAGGGCTGAGGAGTGCGAGCGCATGG ###########C@CBB::119BAB->;CE=EEEDDE5EEDEGGGGFGGGGGGAGGGFEFFBEGGGFGGGGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:7G32T50 foo_945/2 163 chr4 184591221 36 91M = 184591595 465 GCACTTGAGGAGCCCTTCAGCCCACCACTGCACTGTGGGAGCCCCTTTCTGGGCTGGCCAAGGCCAGAGCCCTCTCCCTCAGCTTGCAGGG GGGGFGGGDGGGGEGGGGGFFGEFGGDFGGEFFGFEFFFFB?FFFFDFFDFFGFD=DEEEFA::EC?=:ABD=DAEEE=EB5:@A4;.8GGGGGEEGDGFGGGGGEGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_957/1 83 chr3 167410024 60 91M = 167409640 -475 AATAACAACCATTGCTCAAATTTGCTGGAAATCAAACTTACAGCCCACAGATACCCACAGAAGAATAGCAGGATCAATGTGTACACTAAAT DCCC5CCEEDED=BDA?BEEEF:FDFEBEEC:AD=A@@@>?DDD5ADDD=DDADADFEDFFEDEE?DFBFFDFFDFEEDDEEEEBEDDFDE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_957/2 163 chr3 167409640 60 91M = 167410024 475 TTTGCTAAAAAGAAATATGAAAATTATGTAGAGGAGTAACATCAGAAAAAGATGATGGAGTAGGCAGCTACAAACTCCTTTCCCACATGAA GGGGDFGGGDDAEEDGGGG=FDFFFGGDE?DEEECEGGGEGGG?GFDFACEECDF=DDADAD=DCEDE=EBAADABCBDEA5AA:77CA?F XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_959/1 83 chr1 206918225 60 91M = 206917853 -463 TAGAAACATATACAGAAGAACAAAACTATTAGGATTTCACACCAAAACCCTTTAAATAGGAATTTATTAAACATTAATTGTTCCCTCTTTC EFEFDEFGFCFEEGGFC??CEABA@?EDAEGFEEDEEFDGCCBC=CCED:EGGGGGEGFGGGGGFGGGGGFGGGFGGGGGGGGGGGGGDGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_959/2 163 chr1 206917853 60 91M = 206918225 463 TCTCAAAAGATTAAGAACAGATTATATTCTTAATCTTTTGATTACAAAGAGACAAAGAGCATTCAAACAGCTGAACTCTTATTGTATTGCC GGGGGGGGGGFEGGEGGGGGGGGGGGDGGGGEGGGGGFFFEGGGGGGGGGGGEGGGGGGGFGFGGGEGBDFAFFFGGGGGGFEDDEBE=A= XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_961/1 83 chr3 36806989 60 91M = 36806613 -467 TCTCATGCCCTTACCTGCATCCCATCCCTTCCCCCAAGCTGGATACTCATCCTTCAGCTTATCACCCATAGGGTCTCCATCTAGTTGTCAC @EEEA=5B@?@=;C@>>A:5ABB::CEEE7EE5EEEEEEEDCCADDEEEEEDCDD=BEEEEBE:EEEEAEEAB:A:EEECECEEEEEEE?D XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_961/2 163 chr3 36806613 60 91M = 36806989 467 AGCTGTTGCCAGCTTTTCATCAGTCCTTTGAAAGGAGTTGTTGACACTGCTCAGGGCCACCTTCTCTACTCCAAGTTACTCTTCAACCCAA C>A:??AAAC?DABD:DCA?CCDDEE:ED:CCC??BBCC:B?=D:?AC?>:CA=@:,@@2;.:8:AA;>>/;8:4?C5:A0(;*=??=A5> XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_963/1 83 chr7 142308461 37 91M = 142308054 -498 AATGAGACACTTTAAAATATCCTAAGTGACAAAAGCAACAACTCCTTTTCTAATCTACTAAGCTATTATATGTATTTTAAGATAAACATAT DDDFBDDFCDEFEFEFBCCCDGDEBFEECAEEF@FFGEGGEGGGEGGGGGGGEGGGEGGGGGGGGGGGGGGGFGGGGEGGGGFGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_963/2 163 chr7 142308054 29 91M = 142308461 498 GTAATCCCAGCACTTTGGGAGGCTGAGGCGGGTGGATCACGAGGTCAGGAGATCGAGACCATCCTGGCTAACACGGTGAAACCCCGTCTCT GGGGGGGGGGGGGGGGGGGFGGGGGGGGGGEE2EE?A@@B?=CC=DBCD@ECCEBCBBCEB:BEB@A==??#################### XT:A:R NM:i:0 SM:i:0 AM:i:0 X0:i:47 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_965/1 81 chr2 85059521 37 91M chr11 133740820 0 GTGTTTTTATAGAGACAGGGTTTGTCCATGTTGGTCAGGCTGGTCTCGAACTCCTGACCTCAGGTGATCTGCCCACCTCAGCCTCCCAAAG 
7GGGFGEEEEECEEEEFFFFFGGGGGGGDGGFFGGGBD-BBCEDCDFC==?@C,AA??>5;B:@@@? XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_969/1 99 chr8 75728528 60 91M = 75728901 464 TAAATACAATGGCAAGTATAAAATAAAGTATTTTTTTGATTCCCCATTCAAGCAGTAATTAAATTACATGTACCCATAATGTACATTCCCA GGGGGGGGGGGGGGGGFGGFGGGGDGGGDGGGGGGGGGFGGGGGGFGGGGGGGFGGGFGGGGFGGGGGFGEGGEGGGGGGGFFGGGGGEGD XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_969/2 147 chr8 75728901 60 91M = 75728528 -464 GAGAGAATCGTATTTGGGGTCTACAGAAGAGCAGTAGAGATATGCTTTTTTCTTCATTTTCAGTTAGAGGAGTTTCAAGGAAAAGGTTTGG GEGGDGGFGGGGGDFGGFGFGGFGGGFFGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_971/1 83 chr19 42476835 29 91M = 42476452 -474 TTTTCTTCCTCCTGGGGTTTCTTCCTGCTGGTGGACCCTCCGCGAATCCCGGCCTCCGGAGACCGTCCTGGTAACTGCCCTGGCCAGGACT #######?<;>;>CAA,9208?,@?@>>&>>:CB6BC?:EBDFDEE?GGGFBGFBFFDFEEBEEE?GGBGGGGFGGDGGEGGGFGGGGFGG XT:A:U NM:i:1 SM:i:0 AM:i:0 X0:i:1 X1:i:3 XM:i:1 XO:i:0 XG:i:0 MD:Z:74A16 XA:Z:chr19,-42471784,91M,1;chr19,-42476835,91M,1;chr19,-42241437,91M,1; foo_971/2 163 chr19 42476452 37 25M2D66M = 42476835 474 CGAAAGAGCGAGAAGGGAGAGAGACAGAGAGAGAGAGAGAGAGAGAGACGTGAGAGAGAGACAGAAGTCGGCACACAGACACGCACTGCGC GEBGGGDG?GGGGGFEDCAEBEDEEGGEF=EAEADDE=FCEEECEAA5CCADBDDECEAA?5A??>9.6;6>>@??=?B=(3(.=B?C-B@ XT:A:U NM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:1 XG:i:2 MD:Z:25^AG66 foo_973/1 65 chr9 81954188 25 91M * 0 0 TCCTTTGGGTATATATACAGTAAAGGGATTGCTGGGTTGAATGGTAGTTCTGCTTTTAGTTTGTTGAGATCGTTCTCTTTTTCGTTATTTG =BEEEEEEC:DDDDDADDDD=DDDD?A?CC=DD:=CC=C:6>@@@CA?A=@;-.6:>,=*<<99:6,>.;)8*67>=BB?A=(AAB-B>>C XT:A:U NM:i:4 SM:i:25 AM:i:0 X0:i:1 X1:i:0 XM:i:4 XO:i:0 XG:i:0 MD:Z:52A17A5A5A8 foo_974/1 83 chr10 42475636 60 91M = 42475248 -479 ATAAGCAACCTACATTTTCCTCCCTGTATCTCATGTTCTGTGTCAACAGATGTGTGCTCTGGAGGACAGAGCAGTTCTCCTATGTTTCTTT GFGAGGG@GGGEGGGFECFFEG?GGGGEGDGGGGFGDGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_974/2 163 chr10 42475248 60 91M = 42475636 479 CCACTCAGGGCATTCTGTTCTTCTTACCAAGCTCTGCCATCATTAAACTATTTTAGCAAAGCCTGACTTACTGGGGTTTTGTCTGAGTTTA GG?GGFAFFDGGGGGEGGGFGGFGGGGGGDGGGGGGGFGGGGGGGGGGGGGGGGGDGGGFGFGGGDGDGGGGFFFGEGGGGFGGDFF=FFF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_976/1 99 chr12 95784026 60 91M = 95784378 443 ATTTTGATGTTAAAAAATATATTAAGAAAGAAGTTTCTCTTTACTAACACTCATTTTCTATTTCTACTTTCCACTTCAGTAGAGCCTTATG GGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGFGEGGGGFGGGGFGGGGGGGGGGGFGGEEGGGGGGGGGGDGGCGGGEDGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_976/2 147 chr12 95784378 60 91M = 95784026 -443 ATATTTAAGATAACTTTCATACATTTTGTATTTAACAGCTTAGAATGGAATAGGTCTAAAACCAGATTCTGCCCAAGTACAAACCACCCTG GDGGFGGEGEGGCGCGGEEGFGFGGGFGGGFGEDGGGGEGGGGGFFDGGGGGFEFGFGEGEGGGGGGGGGGGGDFGFFFEFFEFFCEEEEE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_978/1 99 chr4 26714158 60 91M = 26714521 454 AAGAAATTTTTTCTATCCTATCTATTATTTTACCCTAGTGCCTATAAGAGCACTTGGCACATCGAAGGCACCCAGTAAATATTTCTCAATG EEEDEFEFFFEFFC>FFFDFFDDEFEFDEF:AEEEEACACE?EDDFFFEDDDEFAFDFDBDDDDD5?@:=;347=?A5>?8?5?>@D?:?@ XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_978/2 147 chr4 26714521 60 91M = 26714158 -454 AGCACTTTGGGAGGCCGAGGCAGGCAGATCACCTGAGGTCTGGAATTTGAGACTAGGCTGGCTAACATGGTGAAACCCTGTTTCTACTAAA @:>6?>-EDDEFE:FFE=AEADG:GGDDDDB?:DEEEE5EEDDDDAF5DFDEEEED==BEEGGDGGCAAA>DDDADD=DC=EBEFBEEEDE XT:A:U NM:i:0 SM:i:37 
AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_980/1 83 chr6 70510288 60 91M = 70509922 -457 AGGATTTTATAATCTAAAAATCAGACCTAAAATATTTTCACATAAACTAAGAAAATATCCATAAGTAGAACTCAAAATATGGCAAAGGCAC EGFGGFGFGGGDGGCGFEFF=GEG@GGFGGGGGFGGGEGFGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_980/2 163 chr6 70509922 60 91M = 70510288 457 TATCTAAAAATAGCATTCATCTCATAGGGTTTAATACGCAATATTCTAAAACTCAGTATGTCAGTTTTAGGTAATCAAAGATATTGTACAG GGGGFGGGGGGGGGGGGGGGGGGGGGGGGEGGGGGGGGGFGGGGGGGGGGFFGGGGFGGGGGGGEGGGDGGEFFGGFGGGFE:DGDEEEAF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_982/1 99 chr14 96016952 60 91M = 96017322 461 GGATGGGAGATGGGGAGGGCGTCTCACAGCCAGGGATGCAGCCCGGCCTCTGCAATGTCTGGAATGGGGCCCTGGAAAGATGCTGGGAATC GFFGGGGGGGGGGGEGGGGGGEGGGGGGGGGGGGGG=FGGEFGFDDDFAEEGFEEED=E?CB-CCDDDEADDCCDCEBA==@?>=A?5>5= XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_982/2 147 chr14 96017322 60 91M = 96016952 -461 TATTGCTAGGGGGAATCATTCCCAAGTACCTTCTCAAAGGTGTCCAGTTAGTGATATCAAGGTTGTAACTACTCAGAGTAATCAGGGAGAA C=F=EDFEFEDEADEFEGDFGGGDEFGEGGGEGGGGGGGGGGGGGGGGFFEGFGGGGGEFGGGGGFGEGGEFGGGGFGGGFFGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_984/1 65 chr4 12945395 25 91M chr2 58343178 0 GCTTCCAGTTTTTGCCCATTCAGTATGATATTGGCTGTGGGGCTGTCATAGATAGCTCTTAAAATTTTTAGATACATCCCATCCATACCTA DF?FDDBFFBFFEF=EEEEBEEDEEBD?D=CA5??>A.@C;EEE:BA>5A=DBBDD=A5DA;+=6A# XT:A:U NM:i:1 SM:i:37 AM:i:25 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:90A0 foo_986/1 83 chr2 114554725 60 91M = 114554339 -477 TATATCACAGATAGATCTAATTTTATAGTGGATCATAATATATTCTGTAACTGATCTTTATCACTTACGCAGCAAATATGTCCTAGATATT EGGGGFEGFGEGGGEGFEEEGGDDGEGGGGEFGGGGGFGGGGGEGGGGGGGGGGGGGGGGGGEGGGGGGGGGGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:55T35 foo_986/2 163 chr2 114554339 60 91M = 114554725 477 CGCTCATGAACAACTAACTTATTTCTCACAGTTCTGGAGGTTGCTATGTCCAAGATGAAGGCACCAGCAGATTTGGCATCTGGTGAGGGCC GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGEGGGGEGGGGGGGGGGFGGGGGGGGGGGGG=EDFFFFGGEGFEEEBDEEAC XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_988/1 99 chr7 130338986 60 91M = 130339344 449 TTCCTATTTAAACAGTGTAAGAATTGGGGGTGGCATTTGAAGGTGGGAGAAGAAGGGCAAACTTGTTAGTTTACTAGTGTGAGCGGTAGAT F=FFFFBFEFFFFDFDEDEEFFEFFEEEEEBB@@DDBDDDFFFBFDAA?ADECBC:C=EEBEDE:;>A=>FFFEE5A@CCFF:EDEBEEBE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_988/2 147 chr7 130339344 60 91M = 130338986 -449 GCCCAGGCTGGAGTACAAAACATTGCACATATTTTTTTCAATGGTTTACTCATACTTGATGAGAGAGCTTCTATTATATTTAATTTTTTGT FFBDFABBC5EEFFBFE?BGGFFGGDFFGGGEFFFDDFFGGFDDDBFFEFEFGGGGFBDGGFFGGFDFGGGGFFGGGFGGGD>FFGGGGEG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_990/1 83 chr3 61494141 60 91M = 61493746 -486 AATCGCACATATCATCACGGCCACATTCATTTACAGTGAGTGATTAAGTTTGTCCCACACTCAAGGGGAGGAGAATTAGTCTCCACCTTTT B==?>=???@?=C@:C:EECEFCFEBDFEBBDDDFGEGEGFBGDGGFGGBFGEFGGEGEGFFGFGGGGGGGGFGGGGGGEGGGGEGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91 foo_990/2 163 chr3 61493746 60 91M = 61494141 486 TGAAACTGGCTTTCTCCCAGAGTGACTGATCCAAACACAGTTAGGTGAAGGTGTCAATATGCTTTACAATACAGTCGGGGGCAGTGGCTCC GFGGGGGDGGGGGGGGGGFGFFEFFFFFFFGGGGAGGFGFEEEEE5DDDDA5AAAEEEEEGD=GGFGDGGFA?GBCCEC############ XT:A:U NM:i:2 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:79T10A0 foo_992/1 99 chr8 63749236 60 91M = 63749604 459 
TCTTAGCAAACACACACAAATGCCTGCAATCTGAACCTCCTCCACCTATCGCTCTGGCCACATCCAGACATATGGAGGAACTCAAAATTCC GGFGGGGGGGGGGGFGGGGFGDGFDFDFFFGGGCFGGGGEGGGEGGGD?GEFFFEDAEFEEFBDFFEGGGGBEEEFFD?EFFFAFBFDDDF XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91
foo_992/2 147 chr8 63749604 60 91M = 63749236 -459 ATGAATGATTGCAGGTATCCTGCTTGGGGATGGGGGCGAGAGAATGGTATCACTGTTGTTAGAAACACTAAACGTAATAAATGGCAAGGAA E:CEB?FFAFFC5ECE?5CBEGGGCGGGFDEGEGGFGGDGFGFGFGEEDEDEGGGGGGFGDGGDFGD?FFDDGGGGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91
foo_994/1 99 chr3 13832430 60 91M = 13832790 451 ATGAAGAGGCCCACATTCCCATGTGGCAAAAATGCCCTTGTGTTGAGCAGCTTACTTTCTGAGACTGGGTTCCTGCCAAGACCCCAGGCAT GGGGGGGGGGGGGGGGGGGGGGGGGGBGGFGGBFFGGGFGGGDGGGGEGGGGDGGGGEGGGGGGGGFCFCGGBE=EGDGEE5EEEFFFFFB XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91
foo_994/2 147 chr3 13832790 60 91M = 13832430 -451 AGTCATTACTGGGAGGATCCTCACAACAGCCCTGTGAGGTAGGTGTTATTCTGATTTCACAGGTGGAGAGACTGGGGCTCAGAGCACAGCT FEBCB?FEGGGGGGGECCE?BGDFGBFFFDBGGGGEGGGFFGGFGFGGGGGEGGGGGFFAGGGGGGGGGGEGDGGGGGFGGFFFF?EEEEE XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:69A21
foo_996/1 65 chr3 13832430 25 91M * 0 0 ATGAAGAGGCCCACATTCCCATGTGGCAAAAAAGCCCTTGGGTTGAGCAGCTTACTTTCTGAGACTGGGTTAGTGCCAAGACCCCAGGCAT ?########################################################################################## XT:A:U NM:i:4 SM:i:25 AM:i:0 X0:i:1 X1:i:0 XM:i:4 XO:i:0 XG:i:0 MD:Z:32T7T30C0C18
foo_997/1 99 chr15 67761720 60 91M = 67762092 463 ATAAGACTTGAAAGTTGAAATTACTCTTTGAGCTAAGGGCTGAGATTGGATGTTGTGTTAGCAGGCATGAAAACAACATCAATTGTACATC GGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGFGGFGGGGGGDEGGGGFGCGGDDFFBECEEFEEECBCDC=:@=??CEE=EDEBEEEC XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91
foo_997/2 147 chr15 67762092 60 91M = 67761720 -463 CCCCTGATAAGAGAGTCAACCTGTCCTTTGCCATGGAAGTCAGGCATTGACTTTTCTTTTCTTTTCTTTTCTTTTTTTTTTTTTTAATTAT ?ABA@CAA?:A?AAA?=@:A@=5?CAAE:=EEBBEC:FFFF-DDCB@@@:27:;B'>@AD8>AAA0;A=:1EGGGGGFCDGGGGFGGGGGG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:46C44
foo_999/1 81 chr22 31739572 37 91M * 0 0 AATTTTGAAATATACAATGCATTATTTATAATGCATTATAGTGACTGTAAAGTCACTATTCTGTGCAAAAGATCACAAGGGCTTATCTCTC FEEEDDEGGFGEGEGGEAEDEEEEGEEGFGEEFBEGEFEGGGGEFEGFGGGGEGEGGGGGGGGGGGGGGGGGGGFGGGGGGGGFGEGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91

================================================
FILE: examples/pydoop_script/data/stop_words.txt
================================================
one
two
three

================================================
FILE: examples/pydoop_script/data/transpose_input/matrix.txt
================================================
a00 a01 a02
a10 a11 a12
a20 a21 a22
a30 a31 a32
a40 a41 a42

================================================
FILE: examples/pydoop_script/run
================================================
#!/usr/bin/env bash

set -euo pipefail
[ -n "${DEBUG:-}" ] && set -x

this="${BASH_SOURCE-$0}"
this_dir=$(cd -P -- "$(dirname -- "${this}")" && pwd -P)
. "${this_dir}/../config.sh"

for s in base_histogram caseswitch grep grep_compiled lowercase transpose wc_combiner wordcount wordcount_sw; do
    bash "${this_dir}"/run_script.sh ${s}
done

================================================
FILE: examples/pydoop_script/run_script.sh
================================================
#!/usr/bin/env bash

# BEGIN_COPYRIGHT
#
# Copyright 2009-2026 CRS4.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
# of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

set -euo pipefail
[ -n "${DEBUG:-}" ] && set -x

this="${BASH_SOURCE-$0}"
this_dir=$(cd -P -- "$(dirname -- "${this}")" && pwd -P)
. "${this_dir}/../config.sh"

# Use NLineInputFormat to force multiple mappers with a single input file
NL_INPUT_FORMAT="org.apache.hadoop.mapreduce.lib.input.NLineInputFormat"

nargs=1
if [ $# -ne ${nargs} ]; then
    die "Usage: $0 prog"
fi
prog=$1

OPTS=( "-D" "mapreduce.job.name=${prog}" "-D" "mapreduce.task.timeout=10000" )
[ -n "${DEBUG:-}" ] && OPTS+=( "--log-level" "DEBUG" )
case ${prog} in
    base_histogram )
        DATA="${this_dir}/data/base_histogram_input"
        ;;
    transpose )
        DATA="${this_dir}/data/transpose_input"
        OPTS+=( "--num-reducers" "4" "--input-format" "${NL_INPUT_FORMAT}")
        ;;
    wordcount )
        DATA="${this_dir}/../input"
        OPTS+=( "--num-reducers" "2" )
        ;;
    wordcount_sw )
        DATA="${this_dir}/../input"
        OPTS+=( "--num-reducers" "2" )
        OPTS+=( "--upload-file-to-cache" "${this_dir}/data/stop_words.txt" )
        ;;
    wc_combiner )
        DATA="${this_dir}/../input"
        OPTS+=( "--num-reducers" "2" "-c" "combiner" )
        ;;
    *)
        DATA="${this_dir}/../input"
        OPTS+=( "--num-reducers" "0" "-t" "" )
        case ${prog} in
            caseswitch )
                OPTS+=( "-D" "caseswitch.case=upper" )
                ;;
            grep | grep_compiled )
                OPTS+=( "-D" "grep-expression=March" )
                ;;
        esac
esac

WD=$(mktemp -d)
if [ ${prog} == grep_compiled ]; then
    src="${this_dir}"/scripts/grep.py
    script="${WD}"/grep.pyc
    ${PYTHON} -c "from py_compile import compile; compile('${src}', cfile='${script}')"
else
    script="${this_dir}"/scripts/${prog}.py
fi

if [ "$(hadoop_fs)" != "file" ]; then
    ensure_dfs_home
    INPUT="input"
    OUTPUT="output"
    ${HDFS} dfs -rm -r -f "${INPUT}" "${OUTPUT}"
    ${HDFS} dfs -put "${DATA}" "${INPUT}"
else
    INPUT="${DATA}"
    OUTPUT="${WD}/output"
fi

${PYDOOP} script "${OPTS[@]}" "${script}" "${INPUT}" "${OUTPUT}"
${PYTHON} "${this_dir}"/check.py ${prog} "${OUTPUT}"
rm -rf "${WD}"
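
The only non-obvious step above is the grep_compiled branch, which byte-compiles grep.py and submits the resulting .pyc instead of the source. As a minimal sketch (the paths are illustrative, not the ones the script computes), the embedded one-liner amounts to:

from py_compile import compile as py_compile

# equivalent to the ${PYTHON} -c invocation in run_script.sh
py_compile("scripts/grep.py", cfile="/tmp/wd/grep.pyc")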

================================================
FILE: examples/pydoop_script/scripts/base_histogram.py
================================================
# BEGIN_COPYRIGHT
#
# Copyright 2009-2026 CRS4.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
# of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

"""
Count the base frequency in sequencing data (in SAM format).

input: file in SAM format
output: tab-separated (base, count) pairs
"""


def mapper(_, samrecord, writer):
    seq = samrecord.split("\t", 10)[9]
    for c in seq:
        writer.emit(c, 1)
    writer.count("bases", len(seq))


def reducer(key, ivalue, writer):
    writer.emit(key, sum(ivalue))

================================================
FILE: examples/pydoop_script/scripts/caseswitch.py
================================================
# BEGIN_COPYRIGHT
#
# Copyright 2009-2026 CRS4.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
# of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

"""
Convert text to upper or lower case.

By default, the program will switch text to upper case. Set the config
property 'caseswitch.case=lower' if you prefer to switch to lower case.

Set --kv-separator to the empty string when running this example.
"""


def mapper(_, record, writer, conf):
    if conf['caseswitch.case'] == 'upper':
        value = record.upper()
    elif conf['caseswitch.case'] == 'lower':
        value = record.lower()
    else:
        raise RuntimeError(
            "Invalid caseswitch value %s" % conf['caseswitch.case']
        )
    writer.emit("", value)

================================================
FILE: examples/pydoop_script/scripts/grep.py
================================================
# BEGIN_COPYRIGHT
#
# Copyright 2009-2026 CRS4.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
# of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

"""
Emit strings that contain the substring provided by the property
'grep-expression' (script raises an exception if the property is missing).

We use the fourth 'conf' argument to retrieve the custom 'grep-expression'
parameter. When running this example, set --kv-separator to the empty string
and --num-reducers 0.
"""

# DOCS_INCLUDE_START


def mapper(_, text, writer, conf):
    if text.find(conf['grep-expression']) >= 0:
        writer.emit("", text)
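
All scripts in this directory follow the same contract: module-level mapper/reducer functions that receive a writer object with emit() and count() methods (plus an optional conf dict). A minimal local harness, assuming it is run from the scripts/ directory so that base_histogram is importable; FakeWriter is invented here and only mimics the writer that pydoop script passes in:

from collections import defaultdict

import base_histogram as bh


class FakeWriter(object):

    def __init__(self):
        self.data = defaultdict(list)
        self.counters = defaultdict(int)

    def emit(self, k, v):
        self.data[k].append(v)

    def count(self, what, n):
        self.counters[what] += n


w = FakeWriter()
rec = "\t".join(["r1", "99", "chr1", "100", "60", "4M", "=", "300", "204",
                 "ACGT", "FFFF"])
bh.mapper(None, rec, w)
out = FakeWriter()
for base, ones in w.data.items():
    bh.reducer(base, ones, out)
print({k: v[0] for k, v in out.data.items()})  # {'A': 1, 'C': 1, 'G': 1, 'T': 1}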

================================================
FILE: examples/pydoop_script/scripts/lowercase.py
================================================
# BEGIN_COPYRIGHT
#
# Copyright 2009-2026 CRS4.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
# of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

"""
Convert text to lowercase.

Set --kv-separator to the empty string when running this example.
"""

# DOCS_INCLUDE_START


def mapper(_, record, writer):
    writer.emit("", record.lower())

================================================
FILE: examples/pydoop_script/scripts/transpose.py
================================================
# BEGIN_COPYRIGHT
#
# Copyright 2009-2026 CRS4.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
# of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

"""\
Transpose a tab-separated text matrix.

  pydoop script transpose.py matrix.txt t_matrix
  hadoop fs -get t_matrix{,}
  sort -mn -k1,1 -o t_matrix.txt t_matrix/part-0000*

t_matrix.txt contains an additional first column with row indexes -- this
might not be a problem if it acts as input for another job.

How does it work? Suppose you want to transpose the following matrix:

  a00 a01 a02
  a10 a11 a12

We can set the intermediate key to the column index to have the framework
automatically regroup elements by column. We also have to send the row
index: the reducer will need it to sort each output row. Although we don't
know the global row index for a given input record, we can use the input
key, which is equal to the global byte count (with the default
TextInputFormat). The key/value stream emitted by the mappers looks like:

  0, (0, 'a00')
  1, (0, 'a01')
  2, (0, 'a02')
  0, (12, 'a10')
  1, (12, 'a11')
  2, (12, 'a12')

And reducers will get:

  0, [(0, 'a00'), (12, 'a10')]
  2, [(0, 'a02'), (12, 'a12')]
  1, [(12, 'a11'), (0, 'a01')]

Writing out the key (i.e., the output row index) together with the value
makes it possible to put the output rows in the correct order.
"""


def mapper(key, value, writer):
    # work around pipes' current limitation with explicit input formats
    try:
        value = value.decode("ascii")
    except AttributeError:
        pass
    for i, a in enumerate(value.split()):
        writer.emit(i, (key, a))


def reducer(key, ivalue, writer):
    row = [_[1] for _ in sorted(ivalue)]
    writer.emit(key, "\t".join(row))
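
The key flow described in the docstring can be reproduced locally without Hadoop; a minimal sketch, with byte offsets written by hand for the 2x3 example matrix:

rows = [(0, "a00 a01 a02"), (12, "a10 a11 a12")]  # (byte offset, line)
shuffled = {}
for key, value in rows:  # what the mapper emits, keyed by column index
    for i, a in enumerate(value.split()):
        shuffled.setdefault(i, []).append((key, a))
for col in sorted(shuffled):  # what each reducer emits
    print(col, "\t".join(v for _, v in sorted(shuffled[col])))
# prints: 0 a00\ta10 / 1 a01\ta11 / 2 a02\ta12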
""" def mapper(_, text, writer): for word in text.split(): writer.emit(word, 1) def reducer(word, icounts, writer): writer.emit(word, sum(icounts)) # DOCS_INCLUDE_START def combiner(word, icounts, writer): writer.count('combiner calls', 1) reducer(word, icounts, writer) ================================================ FILE: examples/pydoop_script/scripts/wordcount.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT """\ Pydoop script version of the word count example. """ # DOCS_INCLUDE_START def mapper(_, text, writer): for word in text.split(): writer.emit(word, 1) def reducer(word, icounts, writer): writer.emit(word, sum(icounts)) ================================================ FILE: examples/pydoop_script/scripts/wordcount_sw.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT """\ Word count with stop words (i.e., words that should be ignored). """ # DOCS_INCLUDE_START STOP_WORDS_FN = 'stop_words.txt' try: with open(STOP_WORDS_FN) as f: STOP_WORDS = frozenset(l.strip() for l in f if not l.isspace()) except OSError: STOP_WORDS = frozenset() def mapper(_, value, writer): for word in value.split(): if word in STOP_WORDS: writer.count("STOP_WORDS", 1) else: writer.emit(word, 1) def reducer(word, icounts, writer): writer.emit(word, sum(icounts)) ================================================ FILE: examples/pydoop_submit/check.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. 
#
# END_COPYRIGHT

import sys
import os
import argparse
from ast import literal_eval

import pydoop.test_support as pts
import pydoop.hadut as hadut
import pydoop.hdfs as hdfs

THIS_DIR = os.path.dirname(os.path.abspath(__file__))
DEFAULT_INPUT_DIR = os.path.join(THIS_DIR, os.pardir, "input")
CHECKS = [
    "nosep",
    "wordcount_minimal",
    "wordcount_full",
    "map_only_java_writer",
    "map_only_python_writer",
]


def check_wordcount_minimal(mr_out_dir):
    output = hadut.collect_output(mr_out_dir)
    local_wc = pts.LocalWordCount(DEFAULT_INPUT_DIR)
    res = local_wc.check(output)
    return res.startswith("OK")  # FIXME: change local_wc to raise an exception


check_wordcount_full = check_wordcount_minimal


def check_nosep(mr_out_dir):
    output = []
    for fn in hadut.iter_mr_out_files(mr_out_dir):
        with hdfs.open(fn, "rt") as f:
            for line in f:
                output.append(line.rstrip())
    exp_output = []
    in_dir = os.path.join(THIS_DIR, "data")
    for name in os.listdir(in_dir):
        with open(os.path.join(in_dir, name)) as f:
            exp_output.extend(["".join(_.rstrip().split()) for _ in f])
    return sorted(exp_output) == sorted(output)


def check_map_only_python_writer(mr_out_dir):
    output = []
    for fn in hadut.iter_mr_out_files(mr_out_dir):
        with hdfs.open(fn, "rt") as f:
            for line in f:
                try:
                    t, rec = line.rstrip().split("\t", 1)
                except ValueError:
                    t, rec = line.rstrip(), ""
                output.append((literal_eval(t), rec))
    output = [_[1] for _ in sorted(output)]
    exp_output = []
    for name in sorted(os.listdir(DEFAULT_INPUT_DIR)):
        with open(os.path.join(DEFAULT_INPUT_DIR, name)) as f:
            exp_output.extend([_.rstrip().upper() for _ in f])
    return exp_output == output


check_map_only_java_writer = check_map_only_python_writer


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("name", metavar="NAME", choices=CHECKS,
                        help="one of: %s" % "; ".join(CHECKS))
    parser.add_argument("mr_out", metavar="DIR", help="MapReduce out dir")
    args = parser.parse_args(sys.argv[1:])
    check = globals()["check_%s" % args.name]
    if check(args.mr_out):
        print("OK.")
    else:
        sys.exit("ERROR: output differs from the expected one")

================================================
FILE: examples/pydoop_submit/data/cols_1.txt
================================================
foo1	bar1
foo2	bar2

================================================
FILE: examples/pydoop_submit/data/cols_2.txt
================================================
foo3	bar3
foo4	bar4
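
How check_nosep (above) relates to the cols_*.txt data: nosep.py (further below) emits the first column as key and the second as value, and NoSeparatorTextOutputFormat writes them back with no separator, so each output line is just the input line with its whitespace removed. A one-line sanity check:

line = "foo1\tbar1"
key, value = line.strip().split("\t")
assert key + value == "".join(line.split())  # both give "foo1bar1"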
#
# END_COPYRIGHT

import os

import pydoop.mapreduce.api as api
import pydoop.mapreduce.pipes as pipes


class Mapper(api.Mapper):

    def __init__(self, context):
        self.name = os.path.basename(context.input_split.filename)

    def map(self, context):
        context.emit((self.name, context.key), context.value.upper())


def __main__():
    pipes.run_task(pipes.Factory(Mapper))


if __name__ == "__main__":
    __main__()

================================================
FILE: examples/pydoop_submit/mr/map_only_python_writer.py
================================================
#!/usr/bin/env python

# BEGIN_COPYRIGHT
#
# Copyright 2009-2026 CRS4.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
# of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

import logging

logging.basicConfig()
LOGGER = logging.getLogger("MapOnly")
LOGGER.setLevel(logging.INFO)

import pydoop.mapreduce.api as api
import pydoop.mapreduce.pipes as pipes
import pydoop.hdfs as hdfs


class Mapper(api.Mapper):

    def __init__(self, context):
        self.name = hdfs.path.basename(context.input_split.filename)

    def map(self, context):
        context.emit((self.name, context.key), context.value.upper())


class Writer(api.RecordWriter):

    def __init__(self, context):
        super(Writer, self).__init__(context)
        self.logger = LOGGER.getChild("Writer")
        jc = context.job_conf
        outfn = context.get_default_work_file()
        self.logger.info("writing to %s", outfn)
        hdfs_user = jc.get("pydoop.hdfs.user", None)
        self.sep = jc.get("mapreduce.output.textoutputformat.separator", "\t")
        self.file = hdfs.open(outfn, "wt", user=hdfs_user)

    def close(self):
        self.file.close()
        self.file.fs.close()

    def emit(self, key, value):
        self.file.write("%r%s%s%s" % (key, self.sep, value, "\n"))


def __main__():
    pipes.run_task(pipes.Factory(Mapper, record_writer_class=Writer))


if __name__ == "__main__":
    __main__()

================================================
FILE: examples/pydoop_submit/mr/nosep.py
================================================
#!/usr/bin/env python

# BEGIN_COPYRIGHT
#
# Copyright 2009-2026 CRS4.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
# of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

import pydoop.mapreduce.api as api
import pydoop.mapreduce.pipes as pp


class Mapper(api.Mapper):

    def map(self, ctx):
        p = ctx.value.strip().split('\t')
        ctx.emit(p[0], p[1])


def __main__():
    pp.run_task(pp.Factory(Mapper, None))


if __name__ == "__main__":
    __main__()
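
A round trip for the record format used by the Python Writer in map_only_python_writer.py above, whose output check.py parses back with ast.literal_eval. This is a local sketch with made-up key and value, no HDFS involved:

from ast import literal_eval

key, value = ("alice_1.txt", 0), "SOME UPPERCASED TEXT"
line = "%r%s%s%s" % (key, "\t", value, "\n")   # what Writer.emit() writes
t, rec = line.rstrip().split("\t", 1)          # what check.py does
assert literal_eval(t) == key and rec == value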

================================================
FILE: examples/pydoop_submit/mr/wordcount_full.py
================================================
#!/usr/bin/env python

# BEGIN_COPYRIGHT
#
# Copyright 2009-2026 CRS4.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
# of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

import logging

logging.basicConfig()
LOGGER = logging.getLogger("WordCount")
LOGGER.setLevel(logging.INFO)

from hashlib import md5

import pydoop.mapreduce.api as api
import pydoop.mapreduce.pipes as pipes
import pydoop.hdfs as hdfs


class Mapper(api.Mapper):

    def __init__(self, context):
        super(Mapper, self).__init__(context)
        context.set_status("initializing mapper")
        self.input_words = context.get_counter("WORDCOUNT", "INPUT_WORDS")

    def map(self, context):
        words = context.value.split()
        for w in words:
            context.emit(w, 1)
        context.increment_counter(self.input_words, len(words))


class Reducer(api.Reducer):

    def __init__(self, context):
        super(Reducer, self).__init__(context)
        context.set_status("initializing reducer")
        self.output_words = context.get_counter("WORDCOUNT", "OUTPUT_WORDS")

    def reduce(self, context):
        context.emit(context.key, sum(context.values))
        context.increment_counter(self.output_words, 1)


class Reader(api.RecordReader):
    """
    Mimics Hadoop's default LineRecordReader (keys are byte offsets with
    respect to the whole file; values are text lines).
    """

    def __init__(self, context):
        super(Reader, self).__init__(context)
        self.logger = LOGGER.getChild("Reader")
        self.logger.debug('started')
        self.isplit = context.input_split
        for a in "filename", "offset", "length":
            self.logger.debug(
                "isplit.{} = {}".format(a, getattr(self.isplit, a))
            )
        self.file = hdfs.open(self.isplit.filename)
        self.file.seek(self.isplit.offset)
        self.bytes_read = 0
        if self.isplit.offset > 0:
            discarded = self.file.readline()
            self.bytes_read += len(discarded)

    def close(self):
        self.logger.debug("closing open handles")
        self.file.close()
        self.file.fs.close()

    def next(self):
        if self.bytes_read > self.isplit.length:
            raise StopIteration
        key = self.isplit.offset + self.bytes_read
        record = self.file.readline()
        if not record:  # end of file
            raise StopIteration
        self.bytes_read += len(record)
        return (key, record.decode("utf-8"))

    def get_progress(self):
        return min(float(self.bytes_read) / self.isplit.length, 1.0)


class Writer(api.RecordWriter):

    def __init__(self, context):
        super(Writer, self).__init__(context)
        self.logger = LOGGER.getChild("Writer")
        jc = context.job_conf
        outfn = context.get_default_work_file()
        self.logger.info("writing to %s", outfn)
        hdfs_user = jc.get("pydoop.hdfs.user", None)
        self.file = hdfs.open(outfn, "wt", user=hdfs_user)
        self.sep = jc.get("mapreduce.output.textoutputformat.separator", "\t")

    def close(self):
        self.logger.debug("closing open handles")
        self.file.close()
        self.file.fs.close()

    def emit(self, key, value):
        self.file.write(key + self.sep + str(value) + "\n")


class Partitioner(api.Partitioner):

    def __init__(self, context):
        super(Partitioner, self).__init__(context)
        self.logger = LOGGER.getChild("Partitioner")

    def partition(self, key, n_reduces):
        reducer_id = int(md5(key).hexdigest(), 16) % n_reduces
        self.logger.debug("reducer_id: %r" % reducer_id)
        return reducer_id
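
# For instance, with n_reduces = 2 the key b"hadoop" goes to partition
# int(md5(b"hadoop").hexdigest(), 16) % 2, so every occurrence of a given
# word reaches the same reducer (the key must already be bytes here, since
# md5() rejects str input).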
record_reader_class=Reader, record_writer_class=Writer, partitioner_class=Partitioner, combiner_class=Reducer ) # DOCS_INCLUDE_END def main(): pipes.run_task(FACTORY) if __name__ == "__main__": main() ================================================ FILE: examples/pydoop_submit/mr/wordcount_minimal.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT """\ Includes only the bare minimum required to run wordcount. See wordcount-full.py for an example that uses counters, RecordReader, etc. """ # DOCS_INCLUDE_START import pydoop.mapreduce.api as api import pydoop.mapreduce.pipes as pipes class Mapper(api.Mapper): def map(self, context): for w in context.value.split(): context.emit(w, 1) class Reducer(api.Reducer): def reduce(self, context): context.emit(context.key, sum(context.values)) FACTORY = pipes.Factory(Mapper, reducer_class=Reducer) def main(): pipes.run_task(FACTORY) if __name__ == "__main__": main() ================================================ FILE: examples/pydoop_submit/run ================================================ #!/usr/bin/env bash set -euo pipefail [ -n "${DEBUG:-}" ] && set -x this="${BASH_SOURCE-$0}" this_dir=$(cd -P -- "$(dirname -- "${this}")" && pwd -P) . "${this_dir}/../config.sh" for s in map_only_java_writer map_only_python_writer nosep wordcount_full wordcount_minimal; do bash "${this_dir}"/run_submit.sh ${s} done bash "${this_dir}"/run_submit.sh -p wordcount_minimal_pstats wordcount_minimal ================================================ FILE: examples/pydoop_submit/run_submit.sh ================================================ #!/usr/bin/env bash # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT set -euo pipefail [ -n "${DEBUG:-}" ] && set -x this="${BASH_SOURCE-$0}" this_dir=$(cd -P -- "$(dirname -- "${this}")" && pwd -P) . "${this_dir}/../config.sh" OPTS=( "-D" "mapreduce.task.timeout=10000" "-D" "mapreduce.job.maps=2" "--python-program" "${PYTHON}" ) while getopts ":p:" opt; do case ${opt} in p ) OPTS+=( "--pstats-dir" "${OPTARG}" ) OPTS+=( "--pstats-fmt" "_test_%s_%05d_%s" ) ;; \? 
) echo "Invalid option: -${OPTARG}" >&2 exit 1 ;; : ) echo "Option -${OPTARG} requires an argument" >&2 exit 1 ;; esac done shift $((${OPTIND} - 1)) nargs=1 if [ $# -ne ${nargs} ]; then die "Usage: $0 [-p PSTATS_DIR] MODULE_NAME" fi MODULE=$1 APP_DIR="${this_dir}/mr" JOBNAME=${MODULE} RESULTS=results.txt OPTS+=( "--job-name" "${JOBNAME}" ) case ${MODULE} in wordcount_minimal ) DATA="${this_dir}"/../input OPTS+=("--entry-point" "main") ;; wordcount_full ) DATA="${this_dir}"/../input OPTS+=("--entry-point" "main") OPTS+=( "--do-not-use-java-record-reader" ) OPTS+=( "--do-not-use-java-record-writer" ) OPTS+=( "-D" "pydoop.hdfs.user=${USER}" ) ;; nosep ) DATA="${this_dir}"/data OPTS+=( "--num-reducers" "0" ) OPTS+=( "--output-format" "it.crs4.pydoop.NoSeparatorTextOutputFormat" ) ;; map_only_java_writer ) DATA="${this_dir}"/../input OPTS+=( "--num-reducers" "0" ) ;; map_only_python_writer ) DATA="${this_dir}"/../input OPTS+=( "--num-reducers" "0" ) OPTS+=( "--do-not-use-java-record-writer" ) ;; esac OPTS+=( "--upload-file-to-cache" "${APP_DIR}/${MODULE}.py" ) [ -n "${DEBUG:-}" ] && OPTS+=( "--log-level" "DEBUG" ) WD=$(mktemp -d) if [ "$(hadoop_fs)" != "file" ]; then ensure_dfs_home INPUT="input" OUTPUT="output" ${HDFS} dfs -rm -r -f "${INPUT}" "${OUTPUT}" ${HDFS} dfs -put "${DATA}" "${INPUT}" else INPUT="${DATA}" OUTPUT="${WD}/output" fi ${PYDOOP} submit "${OPTS[@]}" ${MODULE} "${INPUT}" "${OUTPUT}" ${PYTHON} "${this_dir}"/check.py ${MODULE} "${OUTPUT}" rm -rf "${WD}" ================================================ FILE: examples/run_all ================================================ #!/bin/bash # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT this="${BASH_SOURCE-$0}" this_dir=$(cd -P -- "$(dirname -- "${this}")" && pwd -P) . "${this_dir}/config.sh" trap exit ERR examples=( hdfs input_format pydoop_script pydoop_submit self_contained sequence_file ) some_failed=0 for e in ${examples[@]}; do pushd ${e} echo -ne "\n\n *** RUNNING ${e} EXAMPLE(S) ***\n\n" ./run exit_code=$? if [ ${exit_code} -ne 0 ]; then echo -ne "\n\n #### Error!! Example ${e} finished with code ${exit_code} ###\n\n" >&2 some_failed=1 fi popd done if [ ${some_failed} -ne 0 ]; then echo "##############################################" >&2 echo "Some examples failed to run correctly. Please" >&2 echo "verify your installation" >&2 echo "##############################################" >&2 fi ================================================ FILE: examples/self_contained/check_results.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. 
You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT import sys import re import logging from collections import Counter logging.basicConfig(level=logging.INFO) import pydoop.hdfs as hdfs import pydoop.test_support as pts import pydoop.hadut as hadut def compute_vc(input_dir): data = [] for path in hdfs.ls(input_dir): with hdfs.open(path, 'rt') as f: data.append(f.read()) all_data = ''.join(data) vowels = re.findall('[AEIOUY]', all_data.upper()) return Counter(vowels) def get_res(output_dir): return pts.parse_mr_output(hadut.collect_output(output_dir), vtype=int) def check(measured_res, expected_res): res = pts.compare_counts(measured_res, expected_res) if res: return "ERROR: %s" % res else: return "OK." def main(argv): logger = logging.getLogger("main") logger.setLevel(logging.INFO) input_dir = argv[1] output_dir = argv[2] logger.info("checking results") measured_res = get_res(output_dir) expected_res = compute_vc(input_dir) logger.info(check(measured_res, expected_res)) if __name__ == "__main__": main(sys.argv) ================================================ FILE: examples/self_contained/run ================================================ #!/usr/bin/env bash # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT set -euo pipefail [ -n "${DEBUG:-}" ] && set -x this="${BASH_SOURCE-$0}" this_dir=$(cd -P -- "$(dirname -- "${this}")" && pwd -P) . "${this_dir}/../config.sh" old_pwd=$(pwd) wd=$(mktemp -d) vc_tar="${wd}"/vowelcount.tgz pydoop_tar="${wd}"/pydoop.tgz cd "${this_dir}/../.." ${PYTHON} -m pip install --pre -t "${wd}" . cd "${wd}/pydoop" tar cfz "${pydoop_tar}" . cd "${this_dir}/vowelcount" tar czf "${vc_tar}" . if [ "$(hadoop_fs)" != "file" ]; then ensure_dfs_home input="input" output="output" ${HDFS} dfs -rm -r -f "${input}" "${output}" ${HDFS} dfs -put "${this_dir}/../input" "${input}" else input="${this_dir}/../input" output="${wd}/output" fi opts=( "--python-zip" "${vc_tar}" "--upload-archive-to-cache" "${pydoop_tar}" "--job-name" "self_contained" "--entry-point" "main" "--no-override-home" "--no-override-env" "-D" "mapreduce.task.timeout=10000" ) [ -n "${DEBUG:-}" ] && opts+=( "--log-level" "DEBUG" ) ${PYDOOP} submit "${opts[@]}" vowelcount.mr.main "${input}" "${output}" ${PYTHON} "${this_dir}/check_results.py" "${input}" "${output}" cd ${old_pwd} rm -rf "${wd}" ================================================ FILE: examples/self_contained/vowelcount/__init__.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. 
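#
# check_results.py above reads job output back through the pydoop.hdfs API,
# which handles both HDFS and local paths transparently; the core read
# pattern, as a stand-alone sketch (directory name made up):
#
#   import pydoop.hdfs as hdfs
#
#   for path in hdfs.ls("output_dir"):
#       with hdfs.open(path, "rt") as f:
#           print(path, len(f.read()))
#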
# # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT """ A trivial MapReduce application that counts the occurence of each vowel in a text input stream. It is more structured than would be necessary because we want to test automatic distribution of a package rather than a single module. """ ================================================ FILE: examples/self_contained/vowelcount/lib/__init__.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT # DOCS_INCLUDE_START _VOWELS = set("AEIOUYaeiouy") def is_vowel(c): return c in _VOWELS ================================================ FILE: examples/self_contained/vowelcount/mr/__init__.py ================================================ ================================================ FILE: examples/self_contained/vowelcount/mr/main.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT from pydoop.mapreduce.pipes import run_task, Factory from .mapper import Mapper from .reducer import Reducer def main(): return run_task(Factory(Mapper, Reducer, combiner_class=Reducer)) ================================================ FILE: examples/self_contained/vowelcount/mr/mapper.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. 
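#
# The mapper below depends only on vowelcount.lib.is_vowel, so its logic can
# be sanity-checked without a cluster; e.g. (illustrative only):
#
#   from vowelcount.lib import is_vowel
#
#   assert is_vowel("a") and is_vowel("Y")
#   assert not is_vowel("b")
#   # 4 vowels in "lorem ipsum": o, e, i, u
#   assert sum(1 for c in "lorem ipsum" if is_vowel(c)) == 4
#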
# # END_COPYRIGHT import pydoop.mapreduce.api as api from vowelcount.lib import is_vowel class Mapper(api.Mapper): def map(self, context): for c in context.value: if is_vowel(c): context.emit(c.upper(), 1) ================================================ FILE: examples/self_contained/vowelcount/mr/reducer.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT import pydoop.mapreduce.api as api class Reducer(api.Reducer): def reduce(self, context): s = sum(context.values) context.emit(context.key, s) ================================================ FILE: examples/sequence_file/bin/filter.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT """ Filter out words whose occurrence falls below a specified value. """ import struct from pydoop.mapreduce.pipes import run_task, Factory from pydoop.mapreduce.api import Mapper class FilterMapper(Mapper): """ Process a wordcount output stream, emitting only records relative to words whose count is equal to or above the configured threshold. """ def __init__(self, context): super(FilterMapper, self).__init__(context) jc = context.job_conf self.threshold = jc.get_int("filter.occurrence.threshold") def map(self, context): word, occurrence = context.key, context.value occurrence = struct.unpack(">i", occurrence)[0] if occurrence >= self.threshold: context.emit(word, str(occurrence)) def __main__(): factory = Factory(FilterMapper) run_task(factory, raw_values=True) ================================================ FILE: examples/sequence_file/bin/wordcount.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. 
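#
# filter.py above unpacks counts that this module stores in SequenceFiles as
# big-endian 32-bit integers (hence run_task(..., auto_serialize=False) on
# the writing side). The encoding round-trip is plain struct:
#
#   import struct
#
#   packed = struct.pack(">i", 42)             # b'\x00\x00\x00*'
#   assert struct.unpack(">i", packed)[0] == 42
#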
# # END_COPYRIGHT import struct from pydoop.mapreduce.pipes import run_task, Factory from pydoop.mapreduce.api import Mapper, Reducer class WordCountMapper(Mapper): def map(self, context): for w in context.value.split(): context.emit(w, 1) class WordCountReducer(Reducer): def reduce(self, context): s = sum(context.values) context.emit(context.key.encode("utf-8"), struct.pack(">i", s)) def __main__(): factory = Factory(WordCountMapper, WordCountReducer) run_task(factory, auto_serialize=False) ================================================ FILE: examples/sequence_file/check.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT import argparse import pydoop.test_support as pts import pydoop.hadut as hadut def main(args): output = hadut.collect_output(args.output) local_wc = pts.LocalWordCount(args.input, min_occurrence=args.threshold) res = local_wc.check(output) if res.startswith("OK"): # FIXME: change local_wc to raise an exception print("OK.") else: raise RuntimeError("output differs from the expected one") if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("input", metavar="INPUT_DIR") parser.add_argument("output", metavar="OUTPUT_DIR") parser.add_argument("-t", "--threshold", type=int, metavar="INT", help="min word occurrence", default=10) main(parser.parse_args()) ================================================ FILE: examples/sequence_file/run ================================================ #!/usr/bin/env bash # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT # This example shows how to use Hadoop's SequenceFile input and output formats # with Pydoop. First we run a word count on the input, storing counts as # 32-bit integers in Hadoop SequenceFiles; next we run a MapReduce application # that filters out those words whose count falls below a specified threshold. set -euo pipefail [ -n "${DEBUG:-}" ] && set -x this="${BASH_SOURCE-$0}" this_dir=$(cd -P -- "$(dirname -- "${this}")" && pwd -P) . 
"${this_dir}/../config.sh" local_input="${this_dir}/../input" occurrence_threshold=10 opts=( "--python-program" "${PYTHON}" "-D" "mapreduce.task.timeout=10000" ) [ -n "${DEBUG:-}" ] && OPTS+=( "--log-level" "DEBUG" ) run_wc() { local input=$1 local output=$2 local opts=( "${opts[@]}" ) opts+=( "--job-name" "wordcount" "--num-reducers" "2" "--output-format" "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat" "--upload-file-to-cache" "${this_dir}/bin/wordcount.py" "-D" "mapreduce.output.fileoutputformat.compress.type=NONE" ) ${PYDOOP} submit "${opts[@]}" wordcount "${input}" "${output}" } run_filter() { local input=$1 local output=$2 local opts=( "${opts[@]}" ) opts+=( "--job-name" "filter" "--num-reducers" "0" "--input-format" "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat" "--upload-file-to-cache" "${this_dir}/bin/filter.py" "-D" "filter.occurrence.threshold=${occurrence_threshold}" ) ${PYDOOP} submit "${opts[@]}" filter "${input}" "${output}" } wd=$(mktemp -d) if [ "$(hadoop_fs)" != "file" ]; then ensure_dfs_home input="input" wc_output="wc_output" filter_output="filter_output" ${HDFS} dfs -rm -r -f "${input}" "${wc_output}" "${filter_output}" ${HDFS} dfs -put "${local_input}" "${input}" else input="${local_input}" wc_output="${wd}/wc_output" filter_output="${wd}/filter_output" fi run_wc "${input}" "${wc_output}" run_filter "${wc_output}" "${filter_output}" ${PYTHON} "${this_dir}/check.py" "${local_input}" "${filter_output}" -t ${occurrence_threshold} rm -rf "${wd}" ================================================ FILE: int_test/config.sh ================================================ [ -n "${PYDOOP_INT_TESTS:-}" ] && return || readonly PYDOOP_INT_TESTS=1 die() { echo $1 1>&2 exit 1 } export USER="${USER:-$(whoami)}" export HADOOP="${HADOOP:-hadoop}" export HDFS="${HDFS:-hdfs}" export MAPRED="${MAPRED:-mapred}" export YARN="${YARN:-yarn}" export PYTHON="${PYTHON:-python}" export PY_VER=$("${PYTHON}" -c 'import sys; print(sys.version_info[0])') export PYDOOP="pydoop${PY_VER}" ensure_dfs_home() { ${HDFS} dfs -mkdir -p /user/${USER} } hadoop_fs() { ${HDFS} getconf -confKey fs.defaultFS | cut -d : -f 1 } export -f die ensure_dfs_home hadoop_fs ================================================ FILE: int_test/mapred_submitter/check.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT import argparse import io import os import pstats import sys from collections import Counter from itertools import chain def get_lines(dir_path): rval = [] for name in sorted(os.listdir(dir_path)): path = os.path.join(dir_path, name) if not os.path.isdir(path): with io.open(path, "rt") as f: for line in f: rval.append(line.rstrip()) return rval def check_output(items, exp_items): if len(items) != len(exp_items): raise RuntimeError("n. 
output items = %d (expected: %d)" % ( len(items), len(exp_items) )) for i, (it, exp_it) in enumerate(zip(items, exp_items)): if it != exp_it: raise RuntimeError("wrong output item #%d: %r (expected: %r)" % ( i, it, exp_it )) def check_counters(counter, exp_counter): return check_output(sorted(counter.items()), sorted(exp_counter.items())) def word_count(lines): return Counter(chain(*(_.split() for _ in lines))) def check_map_only(in_dir, out_dir): uc_lines = [_.upper() for _ in get_lines(in_dir)] out_values = [_.split("\t", 1)[1] for _ in get_lines(out_dir)] check_output(out_values, uc_lines) def check_map_reduce(in_dir, out_dir): wc = word_count(get_lines(in_dir)) out_pairs = (_.split("\t", 1) for _ in get_lines(out_dir)) out_wc = {k: int(v) for k, v in out_pairs} check_counters(out_wc, wc) def check_pstats(pstats_dir): pstats_names = os.listdir(pstats_dir) try: bn = pstats_names[0] except IndexError: raise RuntimeError("%r is empty" % (pstats_dir,)) pstats.Stats(os.path.join(pstats_dir, bn)) CHECKS = { "map_only_java_writer": check_map_only, "map_only_python_writer": check_map_only, "map_reduce_combiner": check_map_reduce, "map_reduce_java_rw": check_map_reduce, "map_reduce_java_rw_pstats": check_map_reduce, "map_reduce_python_partitioner": check_map_reduce, "map_reduce_python_reader": check_map_reduce, "map_reduce_python_writer": check_map_reduce, "map_reduce_raw_io": check_map_reduce, "map_reduce_slow_java_rw": check_map_reduce, "map_reduce_slow_python_rw": check_map_reduce, } if __name__ == "__main__": choices = sorted(CHECKS) parser = argparse.ArgumentParser() parser.add_argument("name", metavar="NAME", choices=choices, help="one of: %s" % "; ".join(choices)) parser.add_argument("mr_in", metavar="IN_DIR", help="MapReduce in dir") parser.add_argument("mr_out", metavar="OUT_DIR", help="MapReduce out dir") args = parser.parse_args(sys.argv[1:]) check = CHECKS[args.name] check(args.mr_in, args.mr_out) if "pstats" in args.name: check_pstats("%s.stats" % args.mr_out) sys.stdout.write("OK\n") ================================================ FILE: int_test/mapred_submitter/genwords.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. 
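#
# check.py above recomputes expected word counts with Counter over chained
# splits; the same idiom works on any iterable of lines, e.g.:
#
#   from collections import Counter
#   from itertools import chain
#
#   lines = ["the quick fox", "the lazy fox"]
#   wc = Counter(chain(*(l.split() for l in lines)))
#   assert wc["the"] == 2 and wc["fox"] == 2
#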
# # END_COPYRIGHT import argparse import io import os import sys from random import choice POOL = b"""\ lorem ipsum dolor sit amet consectetur adipiscing elit sed do eiusmod tempor incididunt ut labore et dolore magna aliqua ut enim ad minim veniam quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur excepteur sint occaecat cupidatat non proident sunt in culpa qui officia deserunt mollit anim id est laborum """.splitlines(True) def genfile(path, size): current_size = 0 with io.open(path, "wb") as f: while current_size < size: line = choice(POOL) f.write(line) current_size += len(line) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("out_dir", metavar="OUT_DIR") parser.add_argument("--n-files", metavar="INT", type=int, default=2) parser.add_argument("--file-size", metavar="BYTES", type=int, default=1000) args = parser.parse_args(sys.argv[1:]) os.makedirs(args.out_dir) for i in range(args.n_files): path = os.path.join(args.out_dir, "f%d.txt" % i) genfile(path, args.file_size) ================================================ FILE: int_test/mapred_submitter/input/map_only/f1.txt ================================================ line1 line2 ================================================ FILE: int_test/mapred_submitter/input/map_only/f2.txt ================================================ line3 line4 ================================================ FILE: int_test/mapred_submitter/input/map_reduce/f1.txt ================================================ the quick brown fox had a meeting with the lazy red FӦX ================================================ FILE: int_test/mapred_submitter/input/map_reduce/f2.txt ================================================ the young black FӦX had breakfast with the old pink fox ================================================ FILE: int_test/mapred_submitter/input/map_reduce_long/f.txt ================================================ we need more than ten lines because we are setting the timeout to ten seconds and the map and reduce functions sleep for one seconds before each emit if things are working the timeout will reset at each emit and the job will complete if not the job will crash ================================================ FILE: int_test/mapred_submitter/mr/map_only_java_writer.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. 
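#
# genwords.py above builds test input by sampling whole lines from a fixed
# pool until the requested byte size is reached; besides the CLI, genfile()
# can be called directly (file name made up):
#
#   from genwords import genfile
#
#   genfile("sample.txt", 1000)   # ~1000 bytes of lorem-ipsum lines
#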
# # END_COPYRIGHT import pydoop.mapreduce.api as api import pydoop.mapreduce.pipes as pipes class Mapper(api.Mapper): def map(self, context): context.emit(context.key, context.value.upper()) def __main__(): pipes.run_task(pipes.Factory(Mapper)) if __name__ == "__main__": __main__() ================================================ FILE: int_test/mapred_submitter/mr/map_only_python_writer.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT import pydoop.hdfs as hdfs import pydoop.mapreduce.api as api import pydoop.mapreduce.pipes as pipes SEP_KEY = "mapreduce.output.textoutputformat.separator" class Mapper(api.Mapper): def map(self, context): context.emit(context.key, context.value.upper()) class Writer(api.RecordWriter): def __init__(self, context): super(Writer, self).__init__(context) outfn = context.get_default_work_file() self.file = hdfs.open(outfn, "wt") self.sep = context.job_conf.get(SEP_KEY, "\t") def close(self): self.file.close() def emit(self, key, value): self.file.write(str(key) + self.sep + value + "\n") def __main__(): pipes.run_task(pipes.Factory(Mapper, record_writer_class=Writer)) if __name__ == "__main__": __main__() ================================================ FILE: int_test/mapred_submitter/mr/map_reduce_combiner.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT import pydoop.mapreduce.api as api import pydoop.mapreduce.pipes as pipes class Mapper(api.Mapper): def map(self, context): for w in context.value.split(): context.emit(w, 1) class Reducer(api.Reducer): def reduce(self, context): context.emit(context.key, sum(context.values)) def __main__(): pipes.run_task(pipes.Factory( Mapper, combiner_class=Reducer, reducer_class=Reducer, )) if __name__ == "__main__": __main__() ================================================ FILE: int_test/mapred_submitter/mr/map_reduce_java_rw.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. 
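#
# map_reduce_combiner.py above reuses its Reducer as the combiner, which is
# safe because summing is associative and commutative. Conceptually the
# combiner pre-aggregates each map task's output before the shuffle:
#
#   from collections import Counter
#
#   map_out = [("a", 1), ("b", 1), ("a", 1)]   # raw pairs from one map task
#   combined = Counter()
#   for k, v in map_out:
#       combined[k] += v                       # -> {"a": 2, "b": 1}
#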
You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT import pydoop.mapreduce.api as api import pydoop.mapreduce.pipes as pipes class Mapper(api.Mapper): def map(self, context): for w in context.value.split(): context.emit(w, 1) class Reducer(api.Reducer): def reduce(self, context): context.emit(context.key, sum(context.values)) def __main__(): pipes.run_task(pipes.Factory(Mapper, reducer_class=Reducer)) if __name__ == "__main__": __main__() ================================================ FILE: int_test/mapred_submitter/mr/map_reduce_java_rw_pstats.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT import pydoop.mapreduce.api as api import pydoop.mapreduce.pipes as pipes class Mapper(api.Mapper): def map(self, context): for w in context.value.split(): context.emit(w, 1) class Reducer(api.Reducer): def reduce(self, context): context.emit(context.key, sum(context.values)) def __main__(): factory = pipes.Factory(Mapper, reducer_class=Reducer) pipes.run_task(factory, pstats_dir="pstats") if __name__ == "__main__": __main__() ================================================ FILE: int_test/mapred_submitter/mr/map_reduce_python_partitioner.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. 
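#
# When run_task() is called with pstats_dir, as above, each task dumps a
# cProfile profile there; the dumps are ordinary pstats files, readable with
# the standard library (file name made up):
#
#   import pstats
#
#   st = pstats.Stats("pstats/task_profile")
#   st.sort_stats("cumulative").print_stats(10)
#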
# # END_COPYRIGHT from hashlib import md5 import pydoop.mapreduce.api as api import pydoop.mapreduce.pipes as pipes class Mapper(api.Mapper): def map(self, context): for w in context.value.split(): context.emit(w, 1) class Reducer(api.Reducer): def reduce(self, context): context.emit(context.key, sum(context.values)) class Partitioner(api.Partitioner): def partition(self, key, n_reduces): return int(md5(key).hexdigest(), 16) % n_reduces def __main__(): pipes.run_task(pipes.Factory( Mapper, reducer_class=Reducer, partitioner_class=Partitioner )) if __name__ == "__main__": __main__() ================================================ FILE: int_test/mapred_submitter/mr/map_reduce_python_reader.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT from __future__ import division import pydoop.hdfs as hdfs import pydoop.mapreduce.api as api import pydoop.mapreduce.pipes as pipes class Mapper(api.Mapper): def map(self, context): for w in context.value.split(): context.emit(w, 1) class Reducer(api.Reducer): def reduce(self, context): context.emit(context.key, sum(context.values)) class Reader(api.RecordReader): def __init__(self, context): super(Reader, self).__init__(context) self.split = context.input_split self.file = hdfs.open(self.split.filename) self.bytes_read = 0 if self.split.offset > 0: self.file.seek(self.split.offset) discarded = self.file.readline() # handled in previous split self.bytes_read += len(discarded) def close(self): self.file.close() def next(self): if self.bytes_read > self.split.length: raise StopIteration key = self.split.offset + self.bytes_read value = self.file.readline() if not value: # end of file raise StopIteration self.bytes_read += len(value) return key, value.decode("utf-8") def get_progress(self): return min(self.bytes_read / self.split.length, 1.0) def __main__(): pipes.run_task(pipes.Factory( Mapper, reducer_class=Reducer, record_reader_class=Reader )) if __name__ == "__main__": __main__() ================================================ FILE: int_test/mapred_submitter/mr/map_reduce_python_writer.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. 
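#
# The Partitioner above assigns reducer ids as md5(key) mod n_reduces. Note
# that hashlib expects bytes, so with text keys an explicit encode is
# needed; a defensive sketch:
#
#   from hashlib import md5
#
#   def reducer_id(key, n_reduces):
#       data = key if isinstance(key, bytes) else key.encode("utf-8")
#       return int(md5(data).hexdigest(), 16) % n_reduces
#
#   assert 0 <= reducer_id("fox", 2) < 2
#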
# # END_COPYRIGHT import pydoop.hdfs as hdfs import pydoop.mapreduce.api as api import pydoop.mapreduce.pipes as pipes SEP_KEY = "mapreduce.output.textoutputformat.separator" class Mapper(api.Mapper): def map(self, context): for w in context.value.split(): context.emit(w, 1) class Reducer(api.Reducer): def reduce(self, context): context.emit(context.key, sum(context.values)) class Writer(api.RecordWriter): def __init__(self, context): super(Writer, self).__init__(context) outfn = context.get_default_work_file() self.file = hdfs.open(outfn, "wt") self.sep = context.job_conf.get(SEP_KEY, "\t") def close(self): self.file.close() def emit(self, key, value): self.file.write(key + self.sep + str(value) + "\n") def __main__(): pipes.run_task(pipes.Factory( Mapper, reducer_class=Reducer, record_writer_class=Writer )) if __name__ == "__main__": __main__() ================================================ FILE: int_test/mapred_submitter/mr/map_reduce_raw_io.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT import pydoop.mapreduce.api as api import pydoop.mapreduce.pipes as pipes class Mapper(api.Mapper): # in this case there's no need to serialize/deserialize # key is not used, and bytes objects can be split just like strings def map(self, context): # key = struct.unpack(">q", context.key)[0] # value = context.value.decode("utf-8") for word in context.value.split(): context.emit(word, b"1") class Reducer(api.Reducer): def reduce(self, context): s = sum(int(_) for _ in context.values) context.emit(context.key, b"%d" % s) def __main__(): factory = pipes.Factory(Mapper, reducer_class=Reducer) pipes.run_task( factory, raw_keys=True, raw_values=True, private_encoding=False, auto_serialize=False, ) if __name__ == "__main__": __main__() ================================================ FILE: int_test/mapred_submitter/mr/map_reduce_slow_java_rw.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. 
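#
# In the raw I/O example above, keys and values cross the pipes boundary as
# bytes (raw_keys/raw_values with private_encoding and auto_serialize
# disabled), so the app handles serialization itself. The bytes round-trip
# it relies on:
#
#   values = [b"1", b"1", b"1"]
#   total = sum(int(v) for v in values)   # int() accepts ASCII digit bytes
#   assert (b"%d" % total) == b"3"        # bytes %-formatting (Python 3.5+)
#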
# # END_COPYRIGHT import sys import time import pydoop.mapreduce.api as api import pydoop.mapreduce.pipes as pipes class Mapper(api.Mapper): def __init__(self, context): super(Mapper, self).__init__(context) self.t0 = time.time() def map(self, context): sys.stderr.write("in: %r, %r\n" % (context.key, context.value)) time.sleep(1) for w in context.value.split(): context.emit(w, 1) def close(self): sys.stderr.write("total time: %.3f s\n" % (time.time() - self.t0)) class Reducer(api.Reducer): def __init__(self, context): super(Reducer, self).__init__(context) self.t0 = time.time() def reduce(self, context): sys.stderr.write("input key: %r\n" % (context.key,)) time.sleep(1) context.emit(context.key, sum(context.values)) def close(self): sys.stderr.write("total time: %.3f s\n" % (time.time() - self.t0)) def __main__(): pipes.run_task(pipes.Factory( Mapper, combiner_class=Reducer, reducer_class=Reducer, )) if __name__ == "__main__": __main__() ================================================ FILE: int_test/mapred_submitter/mr/map_reduce_slow_python_rw.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT from __future__ import division import sys import time import pydoop.hdfs as hdfs import pydoop.mapreduce.api as api import pydoop.mapreduce.pipes as pipes SEP_KEY = "mapreduce.output.textoutputformat.separator" class Mapper(api.Mapper): def __init__(self, context): super(Mapper, self).__init__(context) self.t0 = time.time() def map(self, context): sys.stderr.write("in: %r, %r\n" % (context.key, context.value)) time.sleep(1) for w in context.value.split(): context.emit(w, 1) def close(self): sys.stderr.write("total time: %.3f s\n" % (time.time() - self.t0)) class Reducer(api.Reducer): def __init__(self, context): super(Reducer, self).__init__(context) self.t0 = time.time() def reduce(self, context): sys.stderr.write("input key: %r\n" % (context.key,)) time.sleep(1) context.emit(context.key, sum(context.values)) def close(self): sys.stderr.write("total time: %.3f s\n" % (time.time() - self.t0)) class Reader(api.RecordReader): def __init__(self, context): super(Reader, self).__init__(context) self.split = context.input_split self.file = hdfs.open(self.split.filename) self.bytes_read = 0 if self.split.offset > 0: self.file.seek(self.split.offset) discarded = self.file.readline() # handled in previous split self.bytes_read += len(discarded) def close(self): self.file.close() def next(self): if self.bytes_read > self.split.length: raise StopIteration key = self.split.offset + self.bytes_read value = self.file.readline() if not value: # end of file raise StopIteration self.bytes_read += len(value) return key, value.decode("utf-8") def get_progress(self): return min(self.bytes_read / self.split.length, 1.0) class Writer(api.RecordWriter): def __init__(self, context): super(Writer, self).__init__(context) outfn = context.get_default_work_file() self.file = hdfs.open(outfn, "wt") self.sep 
= context.job_conf.get(SEP_KEY, "\t") def close(self): self.file.close() def emit(self, key, value): self.file.write(key + self.sep + str(value) + "\n") def __main__(): pipes.run_task(pipes.Factory( Mapper, reducer_class=Reducer, record_reader_class=Reader, record_writer_class=Writer )) if __name__ == "__main__": __main__() ================================================ FILE: int_test/mapred_submitter/run ================================================ #!/usr/bin/env bash set -euo pipefail [ -n "${DEBUG:-}" ] && set -x this="${BASH_SOURCE-$0}" this_dir=$(cd -P -- "$(dirname -- "${this}")" && pwd -P) . "${this_dir}/../config.sh" app_names=( map_only_java_writer map_only_python_writer map_reduce_combiner map_reduce_java_rw map_reduce_java_rw_pstats map_reduce_python_partitioner map_reduce_python_reader map_reduce_python_writer map_reduce_raw_io map_reduce_slow_java_rw map_reduce_slow_python_rw ) for name in "${app_names[@]}"; do bash "${this_dir}"/run_app.sh ${name} done ================================================ FILE: int_test/mapred_submitter/run_app.sh ================================================ #!/usr/bin/env bash # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT set -euo pipefail [ -n "${DEBUG:-}" ] && set -x this="${BASH_SOURCE-$0}" this_dir=$(cd -P -- "$(dirname -- "${this}")" && pwd -P) . 
"${this_dir}/../config.sh" pushd "${this_dir}" [ $# -ge 1 ] || die "Usage: $0 APP_NAME" name=$1 ${PYTHON} -c "import pydoop; pydoop.check_local_mode()" opts=( "-D" "mapreduce.job.name=${name}" "-D" "mapreduce.task.timeout=10000" ) wd=$(mktemp -d) case ${name} in map_only_java_writer ) input="input/map_only" opts+=( "-D" "mapreduce.pipes.isjavarecordreader=true" "-D" "mapreduce.pipes.isjavarecordwriter=true" "-reduces" "0" ) ;; map_only_python_writer ) input="input/map_only" opts+=( "-D" "mapreduce.pipes.isjavarecordreader=true" "-D" "mapreduce.pipes.isjavarecordwriter=false" "-reduces" "0" ) ;; map_reduce_combiner ) io_sort_mb=1 file_size=$((2 * io_sort_mb * 1024 * 1024)) input="${wd}/map_reduce_very_long" ${PYTHON} genwords.py "${input}" --file-size ${file_size} opts+=( "-D" "mapreduce.pipes.isjavarecordreader=true" "-D" "mapreduce.pipes.isjavarecordwriter=true" "-D" "mapreduce.task.io.sort.mb=${io_sort_mb}" "-reduces" "2" ) ;; map_reduce_python_reader ) input="input/map_reduce" opts+=( "-D" "mapreduce.pipes.isjavarecordreader=false" "-D" "mapreduce.pipes.isjavarecordwriter=true" "-reduces" "2" ) ;; map_reduce_python_writer ) input="input/map_reduce" opts+=( "-D" "mapreduce.pipes.isjavarecordreader=true" "-D" "mapreduce.pipes.isjavarecordwriter=false" "-reduces" "2" ) ;; map_reduce_slow_java_rw ) input="input/map_reduce_long" opts+=( "-D" "mapreduce.job.maps=1" "-D" "mapreduce.pipes.isjavarecordreader=true" "-D" "mapreduce.pipes.isjavarecordwriter=true" "-reduces" "1" ) ;; map_reduce_slow_python_rw ) input="input/map_reduce_long" opts+=( "-D" "mapreduce.job.maps=1" "-D" "mapreduce.pipes.isjavarecordreader=false" "-D" "mapreduce.pipes.isjavarecordwriter=false" "-reduces" "1" ) ;; map_reduce_* ) input="input/map_reduce" opts+=( "-D" "mapreduce.pipes.isjavarecordreader=true" "-D" "mapreduce.pipes.isjavarecordwriter=true" "-reduces" "2" ) ;; * ) rm -rf "${wd}" die "unknown app name: \"${name}\"" esac mrapp="mr/${name}.py" [ -e "${mrapp}" ] || die "\"${mrapp}\" not found" # wrap the python app with a bash layer that sets PATH cat >"${wd}/mrapp" <>"${wd}/mrapp" mrapp="${wd}/mrapp" ensure_dfs_home ${HDFS} dfs -rm -r -f "input" "output" "mrapp.py" "pstats" ${HDFS} dfs -put "${input}" "input" ${HDFS} dfs -put "${mrapp}" "mrapp.py" ${MAPRED} pipes "${opts[@]}" -program "mrapp.py" -input "input" -output "output" echo "checking results" ${HDFS} dfs -get output "${wd}/output" case "${name}" in *pstats ) ${HDFS} dfs -get pstats "${wd}/output.stats" ;; esac ${PYTHON} check.py "${name}" "${input}" "${wd}/output" rm -rf "${wd}" popd ================================================ FILE: int_test/mapred_submitter/run_perf.sh ================================================ #!/usr/bin/env bash # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT # E.g., rm -f log; bash run_perf.sh map_reduce_java_rw > >(tee -a log) 2>&1 set -euo pipefail [ -n "${DEBUG:-}" ] && set -x this="${BASH_SOURCE-$0}" this_dir=$(cd -P -- "$(dirname -- "${this}")" && pwd -P) . 
"${this_dir}/../config.sh" pushd "${this_dir}" [ $# -ge 1 ] || die "Usage: $0 APP_NAME" name=$1 opts=( "-D" "mapreduce.job.name=${name}" "-D" "mapreduce.task.timeout=10000" "-D" "mapreduce.task.io.sort.mb=10" ) case ${name} in map_reduce_python_reader ) opts+=( "-D" "mapreduce.pipes.isjavarecordreader=false" "-D" "mapreduce.pipes.isjavarecordwriter=true" ) ;; map_reduce_python_writer ) opts+=( "-D" "mapreduce.pipes.isjavarecordreader=true" "-D" "mapreduce.pipes.isjavarecordwriter=false" ) ;; map_reduce_* ) opts+=( "-D" "mapreduce.pipes.isjavarecordreader=true" "-D" "mapreduce.pipes.isjavarecordwriter=true" ) ;; * ) die "unknown app name: \"${name}\"" esac opts+=( "-reduces" "2" ) wd=$(mktemp -d) mrapp="mr/${name}.py" [ -e "${mrapp}" ] || die "\"${mrapp}\" not found" cp "${mrapp}" "${wd}/mrapp.py" mrapp="${wd}/mrapp.py" py_exe=$(${PYTHON} -c "import sys; print(sys.executable)") sed -i "1c#!${py_exe}" "${mrapp}" input="${wd}/input" ${PYTHON} genwords.py --n-files 2 --file-size $((50 * 1024 * 1024)) "${input}" ensure_dfs_home ${HDFS} dfs -rm -r -f "input" "output" "mrapp.py" ${HDFS} dfs -put "${input}" "input" ${HDFS} dfs -put "${mrapp}" "mrapp.py" ${MAPRED} pipes "${opts[@]}" -program "mrapp.py" -input "input" -output "output" echo "checking results" ${HDFS} dfs -get output "${wd}/output" ${PYTHON} check.py "${name}" "${input}" "${wd}/output" rm -rf "${wd}" popd ================================================ FILE: int_test/opaque_split/check.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT import argparse import io import os import sys from gen_splits import N_TASKS, ITEMS_PER_TASK def check_output(mr_out_dir): names = [_ for _ in os.listdir(mr_out_dir) if not _.startswith("_")] if len(names) != N_TASKS: raise RuntimeError("found %d output files (expected: %d)" % (len(names), N_TASKS)) idx = [] for n in names: path = os.path.join(mr_out_dir, n) with io.open(path, "rt") as f: lines = [_.rstrip() for _ in f] if len(lines) != ITEMS_PER_TASK: raise RuntimeError("%s has %d lines (expected: %d)" % (n, len(lines), ITEMS_PER_TASK)) idx.extend(int(_.split("\t")[0]) for _ in lines) idx.sort() # not sure order is guaranteed in a map-only job nitems = N_TASKS * ITEMS_PER_TASK if idx != list(range(nitems)): raise RuntimeError("overall indices != range(%d)" % nitems) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("mr_out", metavar="OUT_DIR", help="MapReduce out dir") args = parser.parse_args(sys.argv[1:]) check_output(args.mr_out) sys.stdout.write("OK\n") ================================================ FILE: int_test/opaque_split/gen_splits.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. 
You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT import argparse import sys import pydoop.hdfs as hdfs from pydoop.mapreduce.pipes import OpaqueSplit, write_opaque_splits N_TASKS = 2 ITEMS_PER_TASK = 5 def gen_ranges(): for i in range(N_TASKS): start = ITEMS_PER_TASK * i yield start, start + ITEMS_PER_TASK if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("splits_path", metavar="HDFS_PATH") args = parser.parse_args(sys.argv[1:]) splits = [OpaqueSplit(_) for _ in gen_ranges()] with hdfs.open(args.splits_path, "wb") as f: write_opaque_splits(splits, f) ================================================ FILE: int_test/opaque_split/mrapp.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT """\ Stub of an application where each task is assigned an int range as a (start, stop) tuple. The record reader feeds numbers from the specified range to the mapper, which in this case does nothing but generate a random string. Besides random data generation (e.g., for terasort), this could be used to assign a subset of files from an HDFS directory to each mapper (e.g., for image recognition). """ from __future__ import division import uuid import pydoop.mapreduce.api as api import pydoop.mapreduce.pipes as pipes # py2 compat try: range = xrange except NameError: pass class Reader(api.RecordReader): def __init__(self, context): super(Reader, self).__init__(context) start, stop = context.input_split.payload self.gen = iter(range(start, stop)) self.nitems = max(stop - start, 0) self.key = self.start = start def next(self): self.key = next(self.gen) return self.key, None def get_progress(self): done = self.key - self.start + 1 return min(done / self.nitems, 1.0) class Mapper(api.Mapper): def map(self, context): context.emit(context.key, uuid.uuid4().hex) def __main__(): pipes.run_task(pipes.Factory(Mapper, record_reader_class=Reader)) if __name__ == "__main__": __main__() ================================================ FILE: int_test/opaque_split/run ================================================ #!/usr/bin/env bash # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. 
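#
# gen_splits.py above serializes arbitrary Python payloads as opaque splits;
# at run time each task's RecordReader recovers its payload through
# context.input_split.payload. The generation side in a nutshell (a local
# file is used here for illustration; the real script writes to HDFS):
#
#   from pydoop.mapreduce.pipes import OpaqueSplit, write_opaque_splits
#
#   splits = [OpaqueSplit((0, 5)), OpaqueSplit((5, 10))]
#   with open("input.splits", "wb") as f:
#       write_opaque_splits(splits, f)
#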
You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT set -euo pipefail [ -n "${DEBUG:-}" ] && set -x this="${BASH_SOURCE-$0}" this_dir=$(cd -P -- "$(dirname -- "${this}")" && pwd -P) . "${this_dir}/../config.sh" module="mrapp" input="input" output="output" splits_path="${input}.splits" opts=( "-D" "pydoop.mapreduce.pipes.externalsplits.uri=${splits_path}" "-D" "mapreduce.task.timeout=10000" "-D" "mapreduce.job.maps=2" "--python-program" "${PYTHON}" "--job-name" "${module}" "--num-reducers" "0" "--upload-file-to-cache" "${this_dir}/${module}.py" "--do-not-use-java-record-reader" ) [ -n "${DEBUG:-}" ] && opts+=( "--log-level" "DEBUG" ) pushd "${this_dir}" ${PYTHON} gen_splits.py "${splits_path}" ensure_dfs_home ${HDFS} dfs -rm -r -f "${input}" "${output}" ${HDFS} dfs -mkdir -p "${input}" # TODO: can we remove this constraint? ${PYDOOP} submit "${opts[@]}" ${module} "${input}" "${output}" wd=$(mktemp -d) ${HDFS} dfs -get "${output}" "${wd}/output" ${PYTHON} check.py "${wd}/output" rm -rf "${wd}" popd ================================================ FILE: int_test/progress/mrapp.py ================================================ #!/usr/bin/env python # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT import sys import time import pydoop.mapreduce.api as api import pydoop.mapreduce.pipes as pipes import pydoop.hdfs as hdfs class Mapper(api.Mapper): def map(self, context): time.sleep(1) sys.stderr.write("processing: %r\n" % (context.value,)) context.emit(context.key, len(context.value)) class Writer(api.RecordWriter): def __init__(self, context): super(Writer, self).__init__(context) jc = context.job_conf outfn = context.get_default_work_file() hdfs_user = jc.get("pydoop.hdfs.user", None) self.file = hdfs.open(outfn, "wt", user=hdfs_user) self.sep = jc.get("mapreduce.output.textoutputformat.separator", "\t") def close(self): self.file.close() def emit(self, key, value): self.file.write(str(key) + self.sep + str(value) + "\n") FACTORY = pipes.Factory(Mapper, record_writer_class=Writer) def __main__(): pipes.run_task(FACTORY) ================================================ FILE: int_test/progress/run ================================================ #!/usr/bin/env bash # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. 
You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT set -euo pipefail [ -n "${DEBUG:-}" ] && set -x this="${BASH_SOURCE-$0}" this_dir=$(cd -P -- "$(dirname -- "${this}")" && pwd -P) . "${this_dir}/../config.sh" MODULE="mrapp" TIMEOUT_SECS=10 N_LINES=$((5 * TIMEOUT_SECS)) OPTS=( "-D" "mapreduce.task.timeout=$((1000 * TIMEOUT_SECS))" "-D" "mapreduce.job.maps=1" "--python-program" "${PYTHON}" "--job-name" "${MODULE}" "--num-reducers" "0" "--upload-file-to-cache" "${this_dir}/${MODULE}.py" "--do-not-use-java-record-writer" ) [ -n "${DEBUG:-}" ] && OPTS+=( "--log-level" "DEBUG" ) WD=$(mktemp -d) DATA="${WD}"/${RANDOM} for i in $(seq 1 ${N_LINES}); do echo "foobar_${i}" >> "${DATA}" done if [ "$(hadoop_fs)" != "file" ]; then ensure_dfs_home INPUT=$(basename ${DATA})_in OUTPUT=$(basename ${DATA})_out ${HDFS} dfs -rm -r -f "${INPUT}" "${OUTPUT}" ${HDFS} dfs -put "${DATA}" "${INPUT}" else INPUT="${DATA}" OUTPUT="${WD}"/$(basename ${DATA})_out fi ${PYDOOP} submit "${OPTS[@]}" ${MODULE} "${INPUT}" "${OUTPUT}" ${HDFS} dfs -test -e "${OUTPUT}"/part-m-00000 rm -rf "${WD}" ================================================ FILE: int_test/run_all ================================================ #!/bin/bash # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT set -euo pipefail this="${BASH_SOURCE-$0}" this_dir=$(cd -P -- "$(dirname -- "${this}")" && pwd -P) . "${this_dir}/config.sh" tests=( progress ) # https://issues.apache.org/jira/browse/MAPREDUCE-4000 if [ "$(hadoop_fs)" != "file" ]; then tests+=( mapred_submitter ) fi for e in ${tests[@]}; do pushd "${this_dir}/${e}" ./run popd done ================================================ FILE: notice_template.txt ================================================ Copyright %(year) %(owner). Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: pydoop/__init__.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. 
You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT # DEV NOTE: some of the variables defined here (docstring included) # are parsed by setup.py, check it before modifying them. """ Pydoop: a Python MapReduce and HDFS API for Hadoop -------------------------------------------------- Pydoop is a Python interface to Hadoop that allows you to write MapReduce applications and interact with HDFS in pure Python. """ import os import errno from importlib import import_module import pydoop.hadoop_utils as hu from pydoop.utils.py3compat import configparser, parser_read try: from pydoop.version import version as __version__ except ImportError: # should only happen at compile time __version__ = None _PATH_FINDER = hu.PathFinder() __author__ = ", ".join(( "Simone Leo", "Gianluigi Zanetti", "Luca Pireddu", "Francesco Cabras", "Mauro Del Rio", "Marco Enrico Piras", )) __author_email__ = ", ".join(( "", "", "", "", "", "", )) __url__ = "http://crs4.github.io/pydoop" __propfile_basename__ = "pydoop.properties" def reset(): _PATH_FINDER.reset() def hadoop_home(): return _PATH_FINDER.hadoop_home() def hadoop_conf(): return _PATH_FINDER.hadoop_conf() def hadoop_params(): return _PATH_FINDER.hadoop_params() def hadoop_classpath(): return _PATH_FINDER.hadoop_classpath() def package_dir(): return os.path.dirname(os.path.abspath(__file__)) ############################## # Since Pydoop 1.0, we've stopped supporting installations for multiple # Hadoop versions, so we only have a single module, so the following # functions now return the same value regardless of the Hadoop version. 
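# Illustrative usage (added sketch, not repo code): the accessors above all
# delegate to the module-level PathFinder, so detection results can be
# inspected, and re-triggered after environment changes, like this:
def _debug_hadoop_env():  # hypothetical helper, for illustration only
    import pydoop
    print("home:     ", pydoop.hadoop_home())
    print("conf dir: ", pydoop.hadoop_conf())
    print("framework:", pydoop.hadoop_params().get("mapreduce.framework.name"))
    pydoop.reset()  # drop cached results, e.g. after changing HADOOP_CONF_DIR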
############################## def jar_name(hadoop_vinfo=None): return "pydoop.jar" def jar_path(hadoop_vinfo=None): path = os.path.join(package_dir(), jar_name()) if os.path.exists(path): return path else: return None def complete_mod_name(module, hadoop_vinfo=None): return "%s.%s" % (__package__, module) def import_version_specific_module(name): return import_module(name) # --- get properties --- PROP_FN = os.path.join( os.path.dirname(os.path.abspath(__file__)), __propfile_basename__ ) # http://stackoverflow.com/questions/2819696 class AddSectionWrapper(object): SEC_NAME = 'dummy' def __init__(self, f): self.f = f self.sechead = '[dummy]' + os.linesep def __iter__(self): return self def __next__(self): line = self.readline() if not line: raise StopIteration return line def readline(self): if self.sechead: try: return self.sechead finally: self.sechead = None else: return self.f.readline() def read_properties(fname): parser = configparser.SafeConfigParser() parser.optionxform = str # preserve key case try: with open(fname) as f: parser_read(parser, AddSectionWrapper(f)) except IOError as e: if e.errno != errno.ENOENT: raise return None # compile time, prop file is not there return dict(parser.items(AddSectionWrapper.SEC_NAME)) class LocalModeNotSupported(RuntimeError): def __init__(self): msg = 'ERROR: Hadoop is configured to run in local mode' super(LocalModeNotSupported, self).__init__(msg) def check_local_mode(): if _PATH_FINDER.is_local(): raise LocalModeNotSupported() ================================================ FILE: pydoop/app/__init__.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT ================================================ FILE: pydoop/app/argparse_types.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT import argparse import pydoop.hdfs as hdfs def kv_pair(s): try: k, v = s.split("=", 1) except ValueError: raise argparse.ArgumentTypeError("arg must be in the k=v form") return k, v class UpdateMap(argparse.Action): """\ Update the destination map with a K=V pair. 
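# Standalone sketch of the dummy-section trick used by read_properties()
# above (see the linked StackOverflow question): configparser rejects files
# without a [section] header, so one is injected on the fly. This py3-only
# helper is illustrative; the module itself goes through py3compat instead.
def _parse_sectionless(text):
    import io
    from configparser import ConfigParser
    parser = ConfigParser()
    parser.optionxform = str  # preserve key case, like read_properties()
    parser.read_file(io.StringIO("[dummy]\n" + text))
    return dict(parser.items("dummy"))

# _parse_sectionless("pydoop.version=2.0\n") -> {'pydoop.version': '2.0'}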
>>> parser = argparse.ArgumentParser() >>> _ = parser.add_argument("-D", metavar="K=V", action=UpdateMap) >>> args = parser.parse_args(["-D", "k1=v1", "-D", "k2=v2", "-D", "k2=v3"]) >>> args.D == {'k1': 'v1', 'k2': 'v3'} True """ def __init__(self, option_strings, dest, **kwargs): kwargs = {k: v for k, v in kwargs.items() if k in {"help", "metavar"}} kwargs["type"] = kv_pair super(UpdateMap, self).__init__(option_strings, dest, **kwargs) def __call__(self, parser, namespace, values, option_string=None): if getattr(namespace, self.dest, None) is None: setattr(namespace, self.dest, {}) getattr(namespace, self.dest).update([values]) def a_file_that_can_be_read(x): with open(x, 'r'): pass return x def a_hdfs_file(x): _, _, _ = hdfs.path.split(x) return x def a_comma_separated_list(x): # FIXME unclear how does one check for bad lists... return x ================================================ FILE: pydoop/app/main.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT """ Pydoop command line tool. """ import os import argparse import importlib import sys from pydoop.version import version SUBMOD_NAMES = [ "script", "submit", ] PYDOOP_CONF_FILE = "~/.pydoop/pydoop.conf" class PatchedArgumentParser(argparse.ArgumentParser): """ This is a work-around for a bug in ArgumentParser that is triggered when there is a zero length argument and fromfile_prefix_chars is not None. 
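# Illustrative background (assumption: stock CPython argparse; the file name
# is made up): the @file syntax reads extra arguments from a file, one per
# line.
#
#   p = argparse.ArgumentParser(fromfile_prefix_chars="@")
#   p.add_argument("--job-name")
#   # with "args.conf" containing "--job-name\nwc\n":
#   p.parse_args(["@args.conf"])  # -> Namespace(job_name='wc')
#
# A zero-length argument coming out of such a file can trip older argparse
# releases; _read_args_from_files below swaps empty strings for a placeholder
# token around the parent call and back again afterwards.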
""" def _read_args_from_files(self, arg_strings): place_holder = "abcjdkje-32333a290" assert not (place_holder in arg_strings) args = [x if len(x) > 0 else place_holder for x in arg_strings] new_args = super(PatchedArgumentParser, self)._read_args_from_files(args) return [x if x != place_holder else '' for x in new_args] def make_parser(): parser = PatchedArgumentParser( description="Pydoop command line tool", formatter_class=argparse.ArgumentDefaultsHelpFormatter, epilog=("Supports argparse @confile syntax "), fromfile_prefix_chars='@' ) parser._pydoop_docs_helper = {} parser.add_argument('-V', '--version', action='version', version=version, help='print version number and exit') subparsers = parser.add_subparsers(help="sub-commands") for n in SUBMOD_NAMES: mod = importlib.import_module("%s.%s" % (__package__, n)) subp = mod.add_parser(subparsers) parser._pydoop_docs_helper[n] = subp return parser def main(argv=None): parser = make_parser() if os.path.exists(PYDOOP_CONF_FILE): argv = argv + ['@' + PYDOOP_CONF_FILE] args, unknown = parser.parse_known_args(argv) try: if args.combiner_fn and not args.combine_fn: args.combine_fn = args.combiner_fn # backwards compatibility except AttributeError: # not the script app pass try: func = args.func except AttributeError: parser.error("too few arguments") try: func(args, unknown) except RuntimeError as e: sys.exit("ERROR - {}: {}".format(type(e).__name__, e)) ================================================ FILE: pydoop/app/script.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT """ Pydoop Script ============= A quick and easy to use interface for running simple MapReduce jobs. Pydoop script is a front-end to pydoop submit that automatically builds a map-reduce program using functions contained in a user provided python module. """ import os import pydoop.utils as utils import argparse from tempfile import NamedTemporaryFile from zipfile import ZipFile from .submit import PydoopSubmitter, add_parser_common_arguments from .script_template import DRIVER_TEMPLATE OUT_SEP_KEY = 'mapreduce.output.textoutputformat.separator' NOSEP_OUTPUT_FORMAT = 'it.crs4.pydoop.NoSeparatorTextOutputFormat' DESCRIPTION = "Simplified interface for running simple MapReduce jobs" class PydoopScript(object): def __init__(self, args, unknown_args): self.script_archive = None self.args = None self.convert_args(args, unknown_args) @staticmethod def generate_driver(mr_module, args): combine_fn = args.combine_fn or args.reduce_fn combiner_wp = 'PydoopScriptCombiner' if args.combine_fn else 'None' return DRIVER_TEMPLATE.substitute( module=mr_module, map_fn=args.map_fn, reduce_fn=args.reduce_fn, combine_fn=combine_fn, combiner_wp=combiner_wp, ) def convert_args(self, args, unknown_args): # Create a zip archive containing all we need to run the # script (including the script itself. 
We use # NamedTemporaryFile which will take care of deleting the temp # archive once we're done self.script_archive = NamedTemporaryFile( prefix="pydoop_script_", suffix='.zip' ) zip_filename = self.script_archive.name # Create a one-off temporary file name to avoid name clashes # in the distcache. Keep the same module extension -- it may # be a source file or a byte-compiled file mr_module = utils.make_random_str( prefix="pydoop_script_module_", postfix=os.path.basename(args.module) ) mr_driver = utils.make_random_str(prefix="pydoop_script_driver_") with ZipFile(zip_filename, 'w') as zipf: zipf.write(args.module, arcname=mr_module) zipf.writestr( mr_driver + '.py', self.generate_driver(os.path.splitext(mr_module)[0], args) ) if args.python_zip is None: args.python_zip = [zip_filename] else: args.python_zip.append(zip_filename) args.module = mr_driver args.entry_point = 'main' args.program = mr_driver args.do_not_use_java_record_reader = False args.do_not_use_java_record_writer = False args.output_format = None args.cache_file = None args.cache_archive = None args.upload_to_cache = None args.libjars = None args.conf = None args.disable_property_name_conversion = True args.avro_input = None args.avro_output = None args.keep_wd = False args.pstats_dir = None args.pstats_fmt = None self.args, self.unknown_args = args, unknown_args def run(self): submitter = PydoopSubmitter() if self.args.kv_separator is not None: submitter.properties[OUT_SEP_KEY] = self.args.kv_separator if submitter.properties.get(OUT_SEP_KEY) == '': self.args.output_format = NOSEP_OUTPUT_FORMAT submitter.set_args(self.args, self.unknown_args) submitter.run() return 0 def clean(self): self.script_archive.close() def run(args, unknown_args=None): if unknown_args is None: unknown_args = [] scripter = PydoopScript(args, unknown_args) scripter.run() scripter.clean() return 0 def add_parser_arguments(parser): parser.add_argument('module', metavar='MODULE', help='python module file') parser.add_argument('input', metavar='INPUT', help='hdfs input path') parser.add_argument('output', metavar='OUTPUT', help='hdfs output path') parser.add_argument('-m', '--map-fn', metavar='MAP', default='mapper', help="name of map function within module") parser.add_argument('-r', '--reduce-fn', metavar='RED', default='reducer', help="name of reduce function within module") parser.add_argument('-c', '--combine-fn', metavar='COM', default=None, help="name of combine function within module") parser.add_argument('--combiner-fn', metavar='COM', default=None, help="--combine-fn alias for backwards compatibility") parser.add_argument('-t', '--kv-separator', metavar='SEP', help="output key-value separator") def add_parser(subparsers): parser = subparsers.add_parser( "script", description=DESCRIPTION, formatter_class=argparse.ArgumentDefaultsHelpFormatter, epilog=("Hadoop pipes generic options are supported too. " "Run `hadoop pipes` for more information") ) add_parser_common_arguments(parser) add_parser_arguments(parser) parser.set_defaults(func=run) return parser ================================================ FILE: pydoop/app/script_template.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. 
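# Illustrative command line (added note; paths are made up): the class above
# turns an invocation such as
#
#   pydoop script wordcount.py hdfs_in hdfs_out -t ''
#
# into a pydoop submit run whose --python-zip archive holds the user module
# plus a driver generated from DRIVER_TEMPLATE (below); the empty -t
# separator selects NOSEP_OUTPUT_FORMAT in PydoopScript.run().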
You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT import string DRIVER_TEMPLATE = string.Template("""\ import sys import os import inspect sys.path.insert(0, os.getcwd()) import pydoop.mapreduce.api as api # noqa: E402 import pydoop.mapreduce.pipes as pipes # noqa: E402 import ${module} # noqa: E402 class ContextWriter(object): def __init__(self, context): self.context = context self.counters = {} def emit(self, k, v): self.context.emit(k, v) def count(self, what, howmany): counter = self.counters.setdefault( what, self.context.get_counter('${module}', what) ) self.context.increment_counter(counter, howmany) def status(self, msg): self.context.set_status(msg) def progress(self): self.context.progress() def setup_script_object(obj, fn_attr_name, user_fn, ctx): # Generic constructor for both map and reduce objects. # # Sets the 'writer' and 'conf' attributes. Then, based on the arity # of the given user function (user_fn), sets the object attribute # (fn_attr_name, which should be either 'map' or 'reduce') to point # to either: # # * obj.with_conf (when arity == 4) # * obj.without_conf (when arity == 3) # # This way, when pipes calls the map/reduce function of the object # it actually gets either of the with_conf/without_conf functions # (which must be defined by the PydoopScriptMapper or # PydoopScriptReducer object passed into this function). # # Why all this? The idea is to raise any decision about which # function to call out of the map/reduce functions, which get called # a number of times proportional to the amount of data to process. # On the other hand, the constructor only gets called once per task. 
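# Illustrative user functions (added note; names are made up): what the two
# arities dispatched on below look like from the script author's side.
#
#   def mapper(key, value, writer):        # 3 args -> bound to without_conf
#       for word in value.split():
#           writer.emit(word, 1)
#
#   def mapper(key, value, writer, conf):  # 4 args -> bound to with_conf
#       if conf.get("wc.ignore.case") == "true":
#           value = value.lower()
#       for word in value.split():
#           writer.emit(word, 1)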
if fn_attr_name not in ('map', 'reduce'): raise RuntimeError('Unexpected function attribute ' + fn_attr_name) obj.writer = ContextWriter(ctx) obj.conf = ctx.get_job_conf() spec = inspect.getargspec(user_fn) if spec.varargs or len(spec.args) not in (3, 4): raise ValueError( user_fn + ' must take parameters key, value, writer, and optionally config' ) if len(spec.args) == 3: setattr(obj, fn_attr_name, obj.without_conf) elif len(spec.args) == 4: setattr(obj, fn_attr_name, obj.with_conf) else: raise RuntimeError( 'Unexpected number of ${map_fn} arguments ' + len(spec.args) ) class PydoopScriptMapper(api.Mapper): def __init__(self, ctx): super(PydoopScriptMapper, self).__init__(ctx) setup_script_object(self, 'map', ${module}.${map_fn}, ctx) def without_conf(self, ctx): # old style map function, without the conf parameter writer = ContextWriter(ctx) ${module}.${map_fn}(ctx.key, ctx.value, writer) def with_conf(self, ctx): # new style map function, without the conf parameter writer = ContextWriter(ctx) ${module}.${map_fn}(ctx.key, ctx.value, writer, self.conf) def map(self, ctx): pass class PydoopScriptReducer(api.Reducer): def __init__(self, ctx): super(PydoopScriptReducer, self).__init__(ctx) setup_script_object(self, 'reduce', ${module}.${reduce_fn}, ctx) def without_conf(self, ctx): writer = ContextWriter(ctx) ${module}.${reduce_fn}(ctx.key, ctx.values, writer) def with_conf(self, ctx): writer = ContextWriter(ctx) ${module}.${reduce_fn}(ctx.key, ctx.values, writer, self.conf) def reduce(self, ctx): pass class PydoopScriptCombiner(api.Reducer): def __init__(self, ctx): super(PydoopScriptCombiner, self).__init__(ctx) setup_script_object(self, 'reduce', ${module}.${combine_fn}, ctx) def without_conf(self, ctx): writer = ContextWriter(ctx) ${module}.${combine_fn}(ctx.key, ctx.values, writer) def with_conf(self, ctx): writer = ContextWriter(ctx) ${module}.${combine_fn}(ctx.key, ctx.values, writer, self.conf) def reduce(self, ctx): pass def main(): pipes.run_task(pipes.Factory( PydoopScriptMapper, PydoopScriptReducer, record_reader_class=None, record_writer_class=None, combiner_class=${combiner_wp}, partitioner_class=None)) """) ================================================ FILE: pydoop/app/submit.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT """ An interface to simplify pydoop jobs submission. 
""" import os import sys import glob import argparse import logging import uuid logging.basicConfig(level=logging.INFO) import pydoop import pydoop.hdfs as hdfs import pydoop.hadut as hadut import pydoop.utils as utils import pydoop.utils.conversion_tables as conv_tables from pydoop.mapreduce.api import AVRO_IO_MODES from pydoop.mapreduce.pipes import PSTATS_DIR, PSTATS_FMT from .argparse_types import a_file_that_can_be_read, UpdateMap from .argparse_types import a_comma_separated_list, a_hdfs_file DEFAULT_ENTRY_POINT = '__main__' IS_JAVA_RR = "mapreduce.pipes.isjavarecordreader" IS_JAVA_RW = "mapreduce.pipes.isjavarecordwriter" CACHE_FILES = "mapreduce.job.cache.files" CACHE_ARCHIVES = "mapreduce.job.cache.archives" USER_HOME = "mapreduce.admin.user.home.dir" JOB_REDUCES = "mapreduce.job.reduces" JOB_NAME = "mapreduce.job.name" COMPRESS_MAP_OUTPUT = "mapreduce.map.output.compress" class PydoopSubmitter(object): """ Builds and launches pydoop jobs. """ DESCRIPTION = "Simplified pydoop jobs submission" def __init__(self): pydoop.check_local_mode() self.logger = logging.getLogger("PydoopSubmitter") self.properties = { CACHE_FILES: '', CACHE_ARCHIVES: '', 'mapred.create.symlink': 'yes', # backward compatibility COMPRESS_MAP_OUTPUT: 'true', } self.args = None self.requested_env = dict() self.remote_wd = None self.remote_module = None self.remote_module_bn = None self.remote_exe = None self.pipes_code = None self.files_to_upload = [] self.unknown_args = None @staticmethod def __cache_archive_link(archive_name): # XXX: should we really be dropping the extension from the link name? return os.path.splitext(os.path.basename(archive_name))[0] def __set_files_to_cache_helper(self, prop, upload_and_cache, cache): cfiles = self.properties[prop] if self.properties[prop] else [] cfiles += cache if cache else [] if upload_and_cache: upf_to_cache = [ ('file://' + os.path.realpath(e), hdfs.path.join(self.remote_wd, bn), bn if prop == CACHE_FILES else self.__cache_archive_link(e)) for (e, bn) in ((e, os.path.basename(e)) for e in upload_and_cache) ] self.files_to_upload += upf_to_cache for t in self.files_to_upload: if not hdfs.path.isfile(t[0]): raise RuntimeError("not a file: %r" % (t[0])) cached_files = ["%s#%s" % (h, b) for (_, h, b) in upf_to_cache] cfiles += cached_files self.properties[prop] = ','.join(cfiles) def __set_files_to_cache(self, args): if args.upload_file_to_cache is None: args.upload_file_to_cache = [] self.__set_files_to_cache_helper(CACHE_FILES, args.upload_file_to_cache, args.cache_file) def __set_archives_to_cache(self, args): if args.upload_archive_to_cache is None: args.upload_archive_to_cache = [] if args.python_zip: args.upload_archive_to_cache += args.python_zip self.__set_files_to_cache_helper(CACHE_ARCHIVES, args.upload_archive_to_cache, args.cache_archive) @staticmethod def _env_arg_to_dict(set_env_list): retval = dict() for item in set_env_list: try: name, value = item.split('=', 1) retval[name.strip()] = value.strip() except ValueError: raise RuntimeError( "Bad syntax in env variable argument '%s'" % item ) return retval def set_args(self, args, unknown_args=None): """ Configure job, based on the arguments provided. 
""" if unknown_args is None: unknown_args = [] self.logger.setLevel(getattr(logging, args.log_level)) parent = hdfs.path.dirname(hdfs.path.abspath(args.output.rstrip("/"))) self.remote_wd = hdfs.path.join( parent, utils.make_random_str(prefix="pydoop_submit_") ) self.remote_exe = hdfs.path.join(self.remote_wd, str(uuid.uuid4())) self.properties[JOB_NAME] = args.job_name or 'pydoop' self.properties[IS_JAVA_RR] = ( 'false' if args.do_not_use_java_record_reader else 'true' ) self.properties[IS_JAVA_RW] = ( 'false' if args.do_not_use_java_record_writer else 'true' ) if args.num_reducers is not None: self.properties[JOB_REDUCES] = args.num_reducers if args.job_name: self.properties[JOB_NAME] = args.job_name self.properties.update(args.job_conf or {}) self.__set_files_to_cache(args) self.__set_archives_to_cache(args) self.requested_env = self._env_arg_to_dict(args.set_env or []) self.args = args self.unknown_args = unknown_args def __warn_user_if_wd_maybe_unreadable(self, abs_remote_path): """ Check directories above the remote module and issue a warning if they are not traversable by all users. The reasoning behind this is mainly aimed at set-ups with a centralized Hadoop cluster, accessed by all users, and where the Hadoop task tracker user is not a superuser; an example may be if you're running a shared Hadoop without HDFS (using only a POSIX shared file system). The task tracker correctly changes user to the job requester's user for most operations, but not when initializing the distributed cache, so jobs who want to place files not accessible by the Hadoop user into dist cache fail. """ host, port, path = hdfs.path.split(abs_remote_path) if host == '' and port == 0: # local file system host_port = "file:///" else: # FIXME: this won't work with any scheme other than # hdfs:// (e.g., s3) host_port = "hdfs://%s:%s/" % (host, port) path_pieces = path.strip('/').split(os.path.sep) fs = hdfs.hdfs(host, port) for i in range(0, len(path_pieces)): part = os.path.join( host_port, os.path.sep.join(path_pieces[0: i + 1]) ) permissions = fs.get_path_info(part)['permissions'] if permissions & 0o111 != 0o111: self.logger.warning( ("remote module %s may not be readable by the task " "tracker when initializing the distributed cache. " "Permissions on %s: %s"), abs_remote_path, part, oct(permissions) ) break def _generate_pipes_code(self): env = dict() for e in ('LD_LIBRARY_PATH', 'PATH', 'PYTHONPATH'): env[e] = '' lines = [] if not self.args.no_override_env and not self.args.no_override_ld_path: env['LD_LIBRARY_PATH'] = os.environ.get('LD_LIBRARY_PATH', '') if not self.args.no_override_env and not self.args.no_override_path: env['PATH'] = os.environ.get('PATH', '') if not self.args.no_override_env and not self.args.no_override_pypath: env['PYTHONPATH'] = os.environ.get('PYTHONPATH', '') else: env['PYTHONPATH'] = "${PYTHONPATH}" # set user-requested env variables for var, value in self.requested_env.items(): env[var] = value if self.args.pstats_dir: env[PSTATS_DIR] = self.args.pstats_dir if self.args.pstats_fmt: env[PSTATS_FMT] = self.args.pstats_fmt executable = self.args.python_program if self.args.python_zip: env['PYTHONPATH'] = ':'.join([ self.__cache_archive_link(ar) for ar in self.args.python_zip ] + [env['PYTHONPATH']]) # Note that we have to explicitly put the working directory # in the python path otherwise it will miss cached modules and # packages. 
env['PYTHONPATH'] = "${PWD}:" + env['PYTHONPATH'] lines.append("#!/bin/bash") lines.append('""":"') if self.args.log_level == "DEBUG": lines.append("printenv 1>&2") lines.append("echo PWD=${PWD} 1>&2") lines.append("echo ls -l; ls -l 1>&2") if ( USER_HOME not in self.properties and "HOME" in os.environ and not self.args.no_override_home ): lines.append('export HOME="%s"' % os.environ['HOME']) # set environment variables for var, value in env.items(): if value: self.logger.debug("Setting env variable %s=%s", var, value) lines.append('export %s="%s"' % (var, value)) if self.args.log_level == "DEBUG": lines.append("echo PATH=${PATH} 1>&2") lines.append("echo LD_LIBRARY_PATH=${LD_LIBRARY_PATH} 1>&2") lines.append("echo PYTHONPATH=${PYTHONPATH} 1>&2") lines.append("echo HOME=${HOME} 1>&2") lines.append('echo "executable is $(type -P %s)" 1>&2' % executable) cmd = 'exec "%s" -u "$0" "$@"' % executable if self.args.log_level == 'DEBUG': lines.append("echo cmd to execute: %s" % cmd) lines.append(cmd) lines.append('":"""') if self.args.log_level == "DEBUG": lines.append('import sys') lines.append('sys.stderr.write("%r\\n" % sys.path)') lines.append('sys.stderr.write("%s\\n" % sys.version)') lines.append('import %s as module' % self.args.module) lines.append('module.%s()' % self.args.entry_point) return os.linesep.join(lines) + os.linesep def __validate(self): if not hdfs.path.exists(self.args.input): raise RuntimeError( "Input path %r does not exist" % (self.args.input,) ) if hdfs.path.exists(self.args.output): raise RuntimeError( "Output path %r already exists" % (self.args.output,) ) def __clean_wd(self): if self.remote_wd: try: self.logger.debug( "Removing temporary working directory %s", self.remote_wd ) hdfs.rm(self.remote_wd) except IOError: pass def __setup_remote_paths(self): """ Actually create the working directory and copy the module into it. Note: the script has to be readable by Hadoop; though this may not generally be a problem on HDFS, where the Hadoop user is usually the superuser, things may be different if our working directory is on a shared POSIX filesystem. Therefore, we make the directory and the script accessible by all. """ self.logger.debug("remote_wd: %s", self.remote_wd) self.logger.debug("remote_exe: %s", self.remote_exe) self.logger.debug("remotes: %s", self.files_to_upload) if self.args.module: self.logger.debug( 'Generated pipes_code:\n\n %s', self._generate_pipes_code() ) if not self.args.pretend: hdfs.mkdir(self.remote_wd) hdfs.chmod(self.remote_wd, "a+rx") self.logger.debug("created and chmod-ed: %s", self.remote_wd) pipes_code = self._generate_pipes_code() hdfs.dump(pipes_code, self.remote_exe) self.logger.debug("dumped pipes_code to: %s", self.remote_exe) hdfs.chmod(self.remote_exe, "a+rx") self.__warn_user_if_wd_maybe_unreadable(self.remote_wd) for (l, h, _) in self.files_to_upload: self.logger.debug("uploading: %s to %s", l, h) hdfs.cp(l, h) self.logger.debug("Created%sremote paths:" % (' [simulation] ' if self.args.pretend else ' ')) def run(self): if self.args is None: raise RuntimeError("cannot run without args, please call set_args") if not self.args.pretend: self.__validate() pydoop_classpath = [] libjars = [] if self.args.libjars: libjars.extend(self.args.libjars) if self.args.avro_input or self.args.avro_output: # append Pydoop's avro-mapred jar. Don't put it at the front of # the list or the user won't be able to override it. 
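# What the generated launcher looks like (sketch reconstructed from the code
# above; paths, module name and zip name are made up). It is a bash/python
# polyglot: bash runs the '""":"' line as the no-op ':' command, exports the
# environment and re-executes the same file under python, which instead skips
# the whole bash section because it parses as a triple-quoted string.
#
#   #!/bin/bash
#   """:"
#   export HOME="/home/someuser"
#   export PYTHONPATH="${PWD}:pydoop_script_xyz.zip"
#   exec "/usr/bin/python3" -u "$0" "$@"
#   ":"""
#   import mymod as module
#   module.__main__()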
avro_jars = glob.glob(os.path.join( pydoop.package_dir(), "avro*.jar" )) pydoop_classpath.extend(avro_jars) libjars.extend(avro_jars) pydoop_jar = pydoop.jar_path() if pydoop_jar is None: raise RuntimeError("Can't find pydoop.jar") job_args = [] submitter_class = 'it.crs4.pydoop.mapreduce.pipes.Submitter' pydoop_classpath.append(pydoop_jar) libjars.append(pydoop_jar) self.logger.debug("Submitter class: %s", submitter_class) if self.args.hadoop_conf: job_args.extend(['-conf', self.args.hadoop_conf.name]) if self.args.input_format: job_args.extend(['-inputformat', self.args.input_format]) if self.args.output_format: job_args.extend(['-writer', self.args.output_format]) job_args.extend(['-input', self.args.input]) job_args.extend(['-output', self.args.output]) job_args.extend(['-program', self.remote_exe]) if libjars: job_args.extend(["-libjars", ','.join(libjars)]) if self.args.avro_input: job_args.extend(['-avroInput', self.args.avro_input]) if self.args.avro_output: job_args.extend(['-avroOutput', self.args.avro_output]) if not self.args.disable_property_name_conversion: ctable = conv_tables.mrv1_to_mrv2 props = [ (ctable.get(k, k), v) for (k, v) in self.properties.items() ] self.properties = dict(props) self.logger.debug("properties after projection: %r", self.properties) try: self.__setup_remote_paths() executor = (hadut.run_class if not self.args.pretend else self.fake_run_class) executor(submitter_class, args=job_args, properties=self.properties, classpath=pydoop_classpath, logger=self.logger, keep_streams=False) self.logger.info("Done") finally: if not self.args.keep_wd: self.__clean_wd() def fake_run_class(self, *args, **kwargs): kwargs['logger'].info("Fake run class") repr_list = [repr(_) for _ in args] repr_list.extend('%s=%r' % (k, v) for k, v in kwargs.items()) sys.stdout.write("hadut.run_class(%s)\n" % ', '.join(repr_list)) def run(args, unknown_args=None): if unknown_args is None: unknown_args = [] script = PydoopSubmitter() script.set_args(args, unknown_args) script.run() return 0 def add_parser_common_arguments(parser): parser.add_argument( '--num-reducers', metavar='INT', type=int, help="Number of reduce tasks. Specify 0 to only perform map phase" ) parser.add_argument( '--no-override-home', action='store_true', help=("Don't set the script's HOME directory to the $HOME in your " "environment. Hadoop will set it to the value of the " "'mapreduce.admin.user.home.dir' property") ) parser.add_argument( '--no-override-env', action='store_true', help=("Use the default PATH, LD_LIBRARY_PATH and PYTHONPATH, instead " "of copying them from the submitting client node") ) parser.add_argument( '--no-override-ld-path', action='store_true', help=("Use the default LD_LIBRARY_PATH instead of copying it from the " "submitting client node") ) parser.add_argument( '--no-override-pypath', action='store_true', help=("Use the default PYTHONPATH instead of copying it from the " "submitting client node") ) parser.add_argument( '--no-override-path', action='store_true', help=("Use the default PATH instead of copying it from the " "submitting client node") ) parser.add_argument( '--set-env', metavar="VAR=VALUE", type=str, action="append", help=("Set environment variables for the tasks. 
If a variable " "is set to '', it will not be overridden by Pydoop.") ) parser.add_argument( '-D', '--job-conf', metavar='NAME=VALUE', action=UpdateMap, help='Set a Hadoop property, e.g., -D mapreduce.job.priority=high' ) parser.add_argument( '--python-zip', metavar='ZIP_FILE', type=a_file_that_can_be_read, action="append", help="Additional python zip file" ) parser.add_argument( '--upload-file-to-cache', metavar='FILE', type=a_file_that_can_be_read, action="append", help="Upload and add this file to the distributed cache." ) parser.add_argument( '--upload-archive-to-cache', metavar='FILE', type=a_file_that_can_be_read, action="append", help="Upload and add this archive file to the distributed cache." ) parser.add_argument( '--log-level', metavar="LEVEL", default="INFO", help="Logging level", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL", "FATAL"] ) parser.add_argument( '--job-name', metavar='NAME', type=str, help="name of the job" ) parser.add_argument( '--python-program', metavar='PYTHON', type=str, default=sys.executable, help="python executable that should be used by the wrapper" ) parser.add_argument( '--pretend', action='store_true', help=("Do not actually submit a job, print the generated config " "settings and the command line that would be invoked") ) parser.add_argument( '--hadoop-conf', metavar='HADOOP_CONF_FILE', type=a_file_that_can_be_read, help="Hadoop configuration file" ) parser.add_argument( '--input-format', metavar='CLASS', type=str, help="java classname of InputFormat" ) def add_parser_arguments(parser): parser.add_argument( 'module', metavar='MODULE', type=str, help=("The module containing the Python MapReduce program") ) parser.add_argument( 'input', metavar='INPUT', help='input path to the maps', ) parser.add_argument( 'output', metavar='OUTPUT', help='output path from the reduces', ) parser.add_argument( '--disable-property-name-conversion', action='store_true', help="Do not adapt property names to the hadoop version used." ) parser.add_argument( '--do-not-use-java-record-reader', action='store_true', help="Disable java RecordReader" ) parser.add_argument( '--do-not-use-java-record-writer', action='store_true', help="Disable java RecordWriter" ) parser.add_argument( '--output-format', metavar='CLASS', type=str, help="java classname of OutputFormat" ) parser.add_argument( '--libjars', metavar='JAR_FILE', type=a_comma_separated_list, action="append", help="Additional comma-separated list of jar files" ) parser.add_argument( '--cache-file', metavar='HDFS_FILE', type=a_hdfs_file, action="append", help="Add this HDFS file to the distributed cache as a file." ) parser.add_argument( '--cache-archive', metavar='HDFS_FILE', type=a_hdfs_file, action="append", help="Add this HDFS archive file to the distributed cache" + "as an archive." 
) parser.add_argument( '--entry-point', metavar='ENTRY_POINT', type=str, default=DEFAULT_ENTRY_POINT, help=("Explicitly execute MODULE.ENTRY_POINT() " "in the launcher script.") ) parser.add_argument( '--avro-input', metavar='k|v|kv', choices=AVRO_IO_MODES, help="Avro input mode (key, value or both)", ) parser.add_argument( '--avro-output', metavar='k|v|kv', choices=AVRO_IO_MODES, help="Avro output mode (key, value or both)", ) parser.add_argument( '--pstats-dir', metavar="HDFS_DIR", type=str, help="Profile each task and store stats in this dir" ) parser.add_argument( '--pstats-fmt', metavar="STRING", type=str, help="pstats filename pattern (expert use only)" ) parser.add_argument( '--keep-wd', action='store_true', help="Don't remove the work dir" ) def add_parser(subparsers): parser = subparsers.add_parser( "submit", description=PydoopSubmitter.DESCRIPTION, formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) add_parser_common_arguments(parser) add_parser_arguments(parser) parser.set_defaults(func=run) return parser ================================================ FILE: pydoop/avrolib.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT """ Avro tools. """ # DEV NOTE: since Avro is not a requirement, do *not* import this # module unconditionally anywhere in the main code (importing it in # the Avro examples is OK, ofc). import sys import avro.schema from avro.datafile import DataFileReader, DataFileWriter from avro.io import DatumReader, DatumWriter, BinaryDecoder, BinaryEncoder from pydoop.mapreduce.api import RecordWriter, RecordReader import pydoop.hdfs as hdfs from pydoop.utils.py3compat import StringIO parse = avro.schema.Parse if sys.version_info[0] == 3 else avro.schema.parse class Deserializer(object): def __init__(self, schema_str): schema = parse(schema_str) self.reader = DatumReader(schema) def deserialize(self, rec_bytes): return self.reader.read(BinaryDecoder(StringIO(rec_bytes))) class Serializer(object): def __init__(self, schema_str): schema = parse(schema_str) self.writer = DatumWriter(schema) def serialize(self, record): f = StringIO() encoder = BinaryEncoder(f) self.writer.write(record, encoder) return f.getvalue() try: from pyavroc import AvroDeserializer except ImportError: AvroDeserializer = Deserializer try: from pyavroc import AvroSerializer except ImportError: AvroSerializer = Serializer class SeekableDataFileReader(DataFileReader): FORWARD_WINDOW_SIZE = 8192 def align_after(self, offset): """ Search for a sync point after offset and align just after that. """ f = self.reader if offset <= 0: # FIXME what is a negative offset?? f.seek(0) self._block_count = 0 self._read_header() # FIXME we can't estimate how big it is... 
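# Illustrative round trip with the Serializer/Deserializer pair above
# (pyavroc drops in transparently when installed); the schema is made up.
#
#   schema_str = '''{"type": "record", "name": "Pet",
#                    "fields": [{"name": "name", "type": "string"}]}'''
#   ser, des = AvroSerializer(schema_str), AvroDeserializer(schema_str)
#   blob = ser.serialize({"name": "Argo"})
#   assert des.deserialize(blob) == {"name": "Argo"}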
return sm = self.sync_marker sml = len(sm) pos = offset while pos < self.file_length - sml: f.seek(pos) data = f.read(self.FORWARD_WINDOW_SIZE) sync_offset = data.find(sm) if sync_offset > -1: f.seek(pos + sync_offset) self._block_count = 0 return pos += len(data) # FIXME this is just an example with no error checking class AvroReader(RecordReader): """ Avro data file reader. Reads all data blocks that begin within the given input split. """ def __init__(self, ctx): super(AvroReader, self).__init__(ctx) isplit = ctx.input_split self.region_start = isplit.offset self.region_end = isplit.offset + isplit.length self.reader = SeekableDataFileReader(hdfs.open(isplit.filename), DatumReader()) self.reader.align_after(isplit.offset) def next(self): pos = self.reader.reader.tell() if pos > self.region_end and self.reader._block_count == 0: raise StopIteration record = next(self.reader) return pos, record def get_progress(self): """ Give a rough estimate of the progress done. """ pos = self.reader.reader.tell() return min((pos - self.region_start) / float(self.region_end - self.region_start), 1.0) # FIXME this is just an example with no error checking class AvroWriter(RecordWriter): schema = None def __init__(self, context): super(AvroWriter, self).__init__(context) job_conf = context.job_conf part = int(job_conf['mapreduce.task.partition']) outdir = job_conf["mapreduce.task.output.dir"] outfn = "%s/part-r-%05d.avro" % (outdir, part) wh = hdfs.open(outfn, "w") self.writer = DataFileWriter(wh, DatumWriter(), self.schema) def close(self): self.writer.close() # FIXME do we really need to explicitly close the filesystem? self.writer.writer.fs.close() ================================================ FILE: pydoop/hadoop_utils.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT # DEV NOTE: this module is used by the setup script, so it MUST be # importable even if Pydoop has not been installed (yet). """ Tools for retrieving Hadoop-related information. """ import os import glob import re import platform import subprocess import xml.dom.minidom as dom from xml.parsers.expat import ExpatError class HadoopXMLError(Exception): pass def extract_text(node): return "".join( c.data.strip() for c in node.childNodes if c.nodeType == c.TEXT_NODE ) def parse_hadoop_conf_file(fn): items = [] try: doc = dom.parse(fn) except ExpatError as e: raise HadoopXMLError("not a valid XML file (%s)" % e) conf = doc.documentElement if conf.nodeName != "configuration": raise HadoopXMLError("not a valid Hadoop configuration file") props = [n for n in conf.childNodes if n.nodeName == "property"] nv = {} for p in props: for n in p.childNodes: if n.childNodes: nv[n.nodeName] = extract_text(n) try: items.append((nv["name"], nv["value"])) except KeyError: pass return dict(items) class PathFinder(object): """ Encapsulates the logic to find paths and other info required by Pydoop. 
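# Convention relied on by AvroReader above (added note): each reader aligns
# to the first sync marker at or after its split offset, then consumes every
# block that *begins* inside [offset, offset + length), so adjacent splits
# never hand the same record to two mappers. The file name below is made up.
#
#   from avro.io import DatumReader
#   reader = SeekableDataFileReader(hdfs.open("users.avro"), DatumReader())
#   reader.align_after(split_offset)  # skip to the next block boundary
#   for record in reader:
#       ...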
""" def __init__(self): self.__hadoop_home = None self.__hadoop_conf = None self.__hadoop_params = None self.__hadoop_classpath = None self.__is_local = None def reset(self): self.__init__() # note that this can be None even after trying detection def hadoop_home(self): if not self.__hadoop_home: hh = os.getenv("HADOOP_HOME", os.getenv("HADOOP_PREFIX")) if not hh: exe = subprocess.check_output( "command -v hadoop", shell=True, universal_newlines=True ).strip() candidate, child = os.path.split(os.path.dirname(exe)) if child == "bin" and os.path.isdir(candidate): hh = os.environ["HADOOP_HOME"] = candidate self.__hadoop_home = hh return self.__hadoop_home def hadoop_conf(self): if not self.__hadoop_conf: error = "Hadoop config not found, try setting HADOOP_CONF_DIR" try: self.__hadoop_conf = os.environ["HADOOP_CONF_DIR"] except KeyError: hh = self.hadoop_home() if not hh: raise RuntimeError(error) candidate = os.path.join(hh, 'etc', 'hadoop') if not os.path.isdir(candidate): raise RuntimeError(error) self.__hadoop_conf = os.environ["HADOOP_CONF_DIR"] = candidate return self.__hadoop_conf def hadoop_params(self): if not self.__hadoop_params: params = {} hadoop_conf = self.hadoop_conf() for n in "hadoop", "core", "hdfs", "mapred": fn = os.path.join(hadoop_conf, "%s-site.xml" % n) try: params.update(parse_hadoop_conf_file(fn)) except (IOError, HadoopXMLError): pass # silently ignore, as in Hadoop self.__hadoop_params = params return self.__hadoop_params def hadoop_classpath(self): if not self.__hadoop_classpath: cp = subprocess.check_output( "hadoop classpath --glob", shell=True, universal_newlines=True ).strip() # older hadoop versions ignore --glob if 'hadoop-common' not in cp: cp = ':'.join(':'.join(glob.iglob(_)) for _ in cp.split(':')) self.__hadoop_classpath = cp return self.__hadoop_classpath def __get_is_local(self): conf = self.hadoop_params() keys = ('mapreduce.framework.name', 'mapreduce.jobtracker.address', 'mapred.job.tracker') for k in keys: if conf.get(k, 'local').lower() != 'local': return False return True def is_local(self): """\ Is Hadoop configured to run in local mode? By default, it is. [pseudo-]distributed mode must be explicitly configured. """ if self.__is_local is None: self.__is_local = self.__get_is_local() return self.__is_local ================================================ FILE: pydoop/hadut.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT """ Provides access to some functionalities available via the Hadoop shell. 
""" import os import shlex import subprocess import pydoop.utils.misc as utils import pydoop.hdfs as hdfs from .utils.py3compat import basestring # --- FIXME: perhaps we need a more sophisticated tool for setting args --- GENERIC_ARGS = frozenset([ "-conf", "-D", "-fs", "-jt", "-files", "-libjars", "-archives" ]) CSV_ARGS = frozenset([ "-files", "-libjars", "-archives" ]) # generic args must go before command-specific args def _pop_generic_args(args): generic_args = [] i = len(args) - 1 while i >= 0: if args[i] in GENERIC_ARGS: try: args[i + 1] except IndexError: raise ValueError("option %s has no value" % args[i]) generic_args.extend(args[i: i + 2]) del args[i: i + 2] i -= 1 return generic_args # -files f1 -files f2 --> -files f1,f2 def _merge_csv_args(args): merge_map = {} i = len(args) - 1 while i >= 0: if args[i] in CSV_ARGS: try: args[i + 1] except IndexError: raise ValueError("option %s has no value" % args[i]) k, v = args[i: i + 2] merge_map.setdefault(k, []).append(v.strip()) del args[i: i + 2] i -= 1 for k, vlist in merge_map.items(): args.extend([k, ",".join(vlist)]) # FIXME: the above functions share a lot of code # ------------------------------------------------------------------------- def _construct_property_args(prop_dict): return sum((['-D', '%s=%s' % p] for p in prop_dict.items()), []) # inherits from RuntimeError for backwards compatibility class RunCmdError(RuntimeError): """ Raised by :func:`run_tool_cmd` and all functions that make use of it to indicate that the call failed (returned non-zero). """ def __init__(self, returncode, cmd, output=None): RuntimeError.__init__(self, output) self.returncode = returncode self.cmd = cmd def __str__(self): m = RuntimeError.__str__(self) if m: return m # mimic old run_cmd behaviour else: return "Command '%s' returned non-zero exit status %d" % ( self.cmd, self.returncode ) # keep_streams must default to True for backwards compatibility def run_tool_cmd(tool, cmd, args=None, properties=None, hadoop_conf_dir=None, logger=None, keep_streams=True): """ Run a Hadoop command. If ``keep_streams`` is set to :obj:`True` (the default), the stdout and stderr of the command will be buffered in memory. If the command succeeds, the former will be returned; if it fails, a ``RunCmdError`` will be raised with the latter as the message. This mode is appropriate for short-running commands whose "result" is represented by their standard output (e.g., ``rval = run_tool_cmd("hdfs", "dfsadmin", ["-safemode", "get"])``). If ``keep_streams`` is set to :obj:`False`, the command will write directly to the stdout and stderr of the calling process, and the return value will be empty. This mode is appropriate for long running commands that do not write their "real" output to stdout. 
""" if logger is None: logger = utils.NullLogger() _args = [tool] if hadoop_conf_dir: _args.extend(["--config", hadoop_conf_dir]) _args.append(cmd) if properties: _args.extend(_construct_property_args(properties)) if args: if isinstance(args, basestring): args = shlex.split(args) _merge_csv_args(args) gargs = _pop_generic_args(args) for seq in gargs, args: _args.extend(map(str, seq)) logger.debug('final args: %r', (_args,)) if keep_streams: p = subprocess.Popen( _args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True, ) output, error = p.communicate() else: p = subprocess.Popen(_args, stdout=None, stderr=None, bufsize=1) ret = p.wait() error = 'command exited with %d status' % ret if ret else '' output = '' if p.returncode: raise RunCmdError(p.returncode, ' '.join(_args), error) return output def run_cmd(cmd, args=None, properties=None, hadoop_home=None, hadoop_conf_dir=None, logger=None, keep_streams=True): """ Runs the ``hadoop`` command. Calls :func:`run_tool_cmd` with ``"hadoop"`` as the first argument. """ return run_tool_cmd("hadoop", cmd, args=args, properties=properties, hadoop_conf_dir=hadoop_conf_dir, logger=logger, keep_streams=keep_streams) def run_class(class_name, args=None, properties=None, classpath=None, hadoop_conf_dir=None, logger=None, keep_streams=True): """ Run a Java class with Hadoop (equivalent of running ``hadoop `` from the command line). Additional ``HADOOP_CLASSPATH`` elements can be provided via ``classpath`` (either as a non-string sequence where each element is a classpath element or as a ``':'``-separated string). Other arguments are passed to :func:`run_cmd`. .. note:: ``HADOOP_CLASSPATH`` makes dependencies available **only on the client side**. If you are running a MapReduce application, use ``args=['-libjars', 'jar1,jar2,...']`` to make them available to the server side as well. """ if logger is None: logger = utils.NullLogger() old_classpath = None if classpath: old_classpath = os.getenv('HADOOP_CLASSPATH', '') if isinstance(classpath, basestring): classpath = [classpath] # Prepend the classpaths provided by the user to the existing # HADOOP_CLASSPATH value. Order matters. We could work a little # harder to avoid duplicates, but it's not essential os.environ['HADOOP_CLASSPATH'] = ":".join( classpath + old_classpath.split(':', 1) ) logger.debug('HADOOP_CLASSPATH: %r', os.getenv('HADOOP_CLASSPATH')) try: res = run_cmd(class_name, args, properties, hadoop_conf_dir=hadoop_conf_dir, logger=logger, keep_streams=keep_streams) finally: if old_classpath is not None: os.environ['HADOOP_CLASSPATH'] = old_classpath return res def iter_mr_out_files(mr_out_dir): for fn in hdfs.ls(mr_out_dir): if hdfs.path.basename(fn).startswith("part"): yield fn def collect_output(mr_out_dir, out_file=None): """ Return all mapreduce output in ``mr_out_dir``. Append the output to ``out_file`` if provided. Otherwise, return the result as a single string (it is the caller's responsibility to ensure that the amount of data retrieved fits into memory). 
""" if out_file is None: output = [] for fn in iter_mr_out_files(mr_out_dir): with hdfs.open(fn, "rt") as f: output.append(f.read()) return "".join(output) else: block_size = 16777216 with open(out_file, 'a') as o: for fn in iter_mr_out_files(mr_out_dir): with hdfs.open(fn) as f: data = f.read(block_size) while len(data) > 0: o.write(data) data = f.read(block_size) ================================================ FILE: pydoop/hdfs/__init__.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT r""" This module allows you to connect to an HDFS installation, read and write files and get information on files, directories and global filesystem properties. Configuration ------------- The hdfs module is built on top of ``libhdfs``, in turn a JNI wrapper around the Java fs code: therefore, for the module to work properly, the Java class path must include all relevant Hadoop jars. Pydoop tries to populate the class path automatically by calling ``hadoop classpath``, so make sure the ``hadoop`` command is in the ``PATH`` on all cluster nodes. If your Hadoop configuration directory is in a non-standard location, also ensure that the ``HADOOP_CONF_DIR`` env var is set to the appropriate value. Another important environment variable for this module is ``LIBHDFS_OPTS``, used to set options for the JVM on top of which it runs. To control the heap size, for instance, you could set ``LIBHDFS_OPTS`` to ``"-Xms32m -Xmx512m"``. """ __all__ = [ 'path', 'init', 'reset', 'hdfs', 'default_is_local', 'open', 'dump', 'load', 'cp', 'put', 'get', 'mkdir', 'rm', 'rmr', 'lsl', 'ls', 'chmod', 'move', 'chown', 'rename', 'renames', 'stat', 'lstat', 'access', 'utime', ] import os import pydoop from . import common, path from pydoop.utils.py3compat import bintype try: _ORIG_CLASSPATH except NameError: _ORIG_CLASSPATH = os.getenv("CLASSPATH", "") # --- MODULE CONFIG --- def init(): os.environ["CLASSPATH"] = "%s:%s" % ( pydoop.hadoop_classpath(), _ORIG_CLASSPATH ) init() def reset(): pydoop.reset() init() # --------------------- from .fs import hdfs, default_is_local def open(hdfs_path, mode="r", buff_size=0, replication=0, blocksize=0, user=None, encoding=None, errors=None): """ Open a file, returning an :class:`~.file.hdfs_file` object. ``hdfs_path`` and ``user`` are passed to :func:`~path.split`, while the other args are passed to the :class:`~.file.hdfs_file` constructor. """ host, port, path_ = path.split(hdfs_path, user) fs = hdfs(host, port, user) return fs.open_file(path_, mode, buff_size, replication, blocksize, encoding, errors) def dump(data, hdfs_path, **kwargs): """\ Write ``data`` to ``hdfs_path``. Keyword arguments are passed to :func:`open`, except for ``mode``, which is forced to ``"w"`` (or ``"wt"`` for text data). 
""" kwargs["mode"] = "w" if isinstance(data, bintype) else "wt" with open(hdfs_path, **kwargs) as fo: i = 0 bufsize = common.BUFSIZE while i < len(data): fo.write(data[i: i + bufsize]) i += bufsize fo.fs.close() def load(hdfs_path, **kwargs): """\ Read the content of ``hdfs_path`` and return it. Keyword arguments are passed to :func:`open`. The `"mode"` kwarg must be readonly. """ m, _ = common.parse_mode(kwargs.get("mode", "r")) if m != "r": raise ValueError("opening mode must be readonly") with open(hdfs_path, **kwargs) as fi: data = fi.read() fi.fs.close() return data def _cp_file(src_fs, src_path, dest_fs, dest_path, **kwargs): kwargs.pop("mode", None) kwargs["mode"] = "r" with src_fs.open_file(src_path, **kwargs) as fi: kwargs["mode"] = "w" with dest_fs.open_file(dest_path, **kwargs) as fo: bufsize = common.BUFSIZE while 1: chunk = fi.read(bufsize) if chunk: fo.write(chunk) else: break def cp(src_hdfs_path, dest_hdfs_path, **kwargs): """\ Copy the contents of ``src_hdfs_path`` to ``dest_hdfs_path``. If ``src_hdfs_path`` is a directory, its contents will be copied recursively. Source file(s) are opened for reading and copies are opened for writing. Additional keyword arguments, if any, are handled like in :func:`open`. """ src, dest = {}, {} try: for d, p in ((src, src_hdfs_path), (dest, dest_hdfs_path)): d["host"], d["port"], d["path"] = path.split(p) d["fs"] = hdfs(d["host"], d["port"]) # --- does src exist? --- try: src["info"] = src["fs"].get_path_info(src["path"]) except IOError: raise IOError("no such file or directory: %r" % (src["path"])) # --- src exists. Does dest exist? --- try: dest["info"] = dest["fs"].get_path_info(dest["path"]) except IOError: if src["info"]["kind"] == "file": _cp_file(src["fs"], src["path"], dest["fs"], dest["path"], **kwargs) return else: dest["fs"].create_directory(dest["path"]) dest_hdfs_path = dest["fs"].get_path_info(dest["path"])["name"] for item in src["fs"].list_directory(src["path"]): cp(item["name"], dest_hdfs_path, **kwargs) return # --- dest exists. Is it a file? --- if dest["info"]["kind"] == "file": raise IOError("%r already exists" % (dest["path"])) # --- dest is a directory --- dest["path"] = path.join(dest["path"], path.basename(src["path"])) if dest["fs"].exists(dest["path"]): raise IOError("%r already exists" % (dest["path"])) if src["info"]["kind"] == "file": _cp_file(src["fs"], src["path"], dest["fs"], dest["path"], **kwargs) else: dest["fs"].create_directory(dest["path"]) dest_hdfs_path = dest["fs"].get_path_info(dest["path"])["name"] for item in src["fs"].list_directory(src["path"]): cp(item["name"], dest_hdfs_path, **kwargs) finally: for d in src, dest: try: d["fs"].close() except KeyError: pass def put(src_path, dest_hdfs_path, **kwargs): """\ Copy the contents of ``src_path`` to ``dest_hdfs_path``. ``src_path`` is forced to be interpreted as an ordinary local path (see :func:`~path.abspath`). The source file is opened for reading and the copy is opened for writing. Additional keyword arguments, if any, are handled like in :func:`open`. """ cp(path.abspath(src_path, local=True), dest_hdfs_path, **kwargs) def get(src_hdfs_path, dest_path, **kwargs): """\ Copy the contents of ``src_hdfs_path`` to ``dest_path``. ``dest_path`` is forced to be interpreted as an ordinary local path (see :func:`~path.abspath`). The source file is opened for reading and the copy is opened for writing. Additional keyword arguments, if any, are handled like in :func:`open`. 
""" cp(src_hdfs_path, path.abspath(dest_path, local=True), **kwargs) def mkdir(hdfs_path, user=None): """ Create a directory and its parents as needed. """ host, port, path_ = path.split(hdfs_path, user) fs = hdfs(host, port, user) retval = fs.create_directory(path_) fs.close() return retval def rm(hdfs_path, recursive=True, user=None): """ Remove a file or directory. If ``recursive`` is :obj:`True` (the default), directory contents are removed recursively. """ host, port, path_ = path.split(hdfs_path, user) fs = hdfs(host, port, user) retval = fs.delete(path_, recursive=recursive) fs.close() return retval # backwards compatibility def rmr(hdfs_path, user=None): return rm(hdfs_path, recursive=True, user=user) def lsl(hdfs_path, user=None, recursive=False): """ Return a list of dictionaries of file properties. If ``hdfs_path`` is a file, there is only one item corresponding to the file itself; if it is a directory and ``recursive`` is :obj:`False`, each list item corresponds to a file or directory contained by it; if it is a directory and ``recursive`` is :obj:`True`, the list contains one item for every file or directory in the tree rooted at ``hdfs_path``. """ host, port, path_ = path.split(hdfs_path, user) fs = hdfs(host, port, user) if not recursive: dir_list = fs.list_directory(path_) else: treewalk = fs.walk(path_) top = next(treewalk) if top['kind'] == 'directory': dir_list = list(treewalk) else: dir_list = [top] fs.close() return dir_list def ls(hdfs_path, user=None, recursive=False): """ Return a list of hdfs paths. Works in the same way as :func:`lsl`, except for the fact that list items are hdfs paths instead of dictionaries of properties. """ dir_list = lsl(hdfs_path, user, recursive) return [d["name"] for d in dir_list] def chmod(hdfs_path, mode, user=None): """ Change file mode bits. :type path: string :param path: the path to the file or directory :type mode: int :param mode: the bitmask to set it to (e.g., 0777) """ host, port, path_ = path.split(hdfs_path, user) fs = hdfs(host, port, user) retval = fs.chmod(path_, mode) fs.close() return retval def move(src, dest, user=None): """ Move or rename src to dest. """ src_host, src_port, src_path = path.split(src, user) dest_host, dest_port, dest_path = path.split(dest, user) src_fs = hdfs(src_host, src_port, user) dest_fs = hdfs(dest_host, dest_port, user) try: retval = src_fs.move(src_path, dest_fs, dest_path) return retval finally: src_fs.close() dest_fs.close() def chown(hdfs_path, user=None, group=None, hdfs_user=None): """ See :meth:`fs.hdfs.chown`. """ user = user or '' group = group or '' host, port, path_ = path.split(hdfs_path, hdfs_user) with hdfs(host, port, hdfs_user) as fs: return fs.chown(path_, user=user, group=group) def rename(from_path, to_path, user=None): """ See :meth:`fs.hdfs.rename`. """ fhost, fport, fpath = path.split(from_path, user) thost, tport, tpath = path.split(to_path, user) with hdfs(thost, tport, user) as fs: chost, cport = fs.host, fs.port with hdfs(fhost, fport, user) as fs: if fs.host != chost or fs.port != cport: raise RuntimeError("can't do a cross-fs rename") return fs.rename(fpath, tpath) def renames(from_path, to_path, user=None): """ Rename ``from_path`` to ``to_path``, creating parents as needed. 
""" to_dir = path.dirname(to_path) if to_dir: mkdir(to_dir, user=user) rename(from_path, to_path, user=user) # direct bindings stat = path.stat lstat = path.lstat access = path.access utime = path.utime ================================================ FILE: pydoop/hdfs/common.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT """ Common hdfs utilities. """ import getpass import pwd import grp import sys __is_py3 = sys.version_info >= (3, 0) BUFSIZE = 16384 DEFAULT_PORT = 8020 # org/apache/hadoop/hdfs/server/namenode/NameNode.java DEFAULT_USER = getpass.getuser() # Unicode objects are encoded using this encoding: TEXT_ENCODING = 'utf-8' # We use UTF-8 since this is what the Hadoop TextFileFormat uses # NOTE: If you change this, you'll also need to fix the encoding # used by the native extension. BASE_MODES = frozenset("rwa") def parse_mode(mode): try: base_mode = mode[0] except IndexError: raise ValueError("mode cannot be empty") if base_mode not in BASE_MODES: raise ValueError("base mode must be one of %s" % ", ".join(BASE_MODES)) try: is_text = mode[1] == "t" except IndexError: is_text = False return base_mode, is_text if __is_py3: def encode_path(path): return path def decode_path(path): return path def encode_host(host): return host def decode_host(host): return host else: def encode_path(path): if isinstance(path, unicode): # noqa: F821 path = path.encode('utf-8') return path def decode_path(path): if isinstance(path, str): path = path.decode('utf-8') return path def encode_host(host): if isinstance(host, unicode): # noqa: F821 host = host.encode('idna') return host def decode_host(host): if isinstance(host, str): host = host.decode('idna') return host def get_groups(user=DEFAULT_USER): groups = set(_.gr_name for _ in grp.getgrall() if user in set(_.gr_mem)) primary_gid = pwd.getpwnam(user).pw_gid groups.add(grp.getgrgid(primary_gid).gr_name) return groups ================================================ FILE: pydoop/hdfs/core/__init__.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT """ HDFS core implementation. 
""" import os def init(): import pydoop.utils.jvm as jvm jvm.load_jvm_lib() try: # NOTE: JVM must be already instantiated import pydoop.native_core_hdfs except ImportError: return None # should only happen at compile time else: return pydoop.native_core_hdfs def core_hdfs_fs(host, port, user): _CORE_MODULE = init() if _CORE_MODULE is None: if os.path.isdir("pydoop"): msg = "Trying to import from the source directory?" else: msg = "Check that Pydoop is correctly installed" raise RuntimeError("Core module unavailable. %s" % msg) return _CORE_MODULE.CoreHdfsFs(host, port, user) ================================================ FILE: pydoop/hdfs/file.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT """ pydoop.hdfs.file -- HDFS File Objects ------------------------------------- """ import os import io import codecs from pydoop.hdfs import common def _complain_ifclosed(closed): if closed: raise ValueError("I/O operation on closed HDFS file object") class FileIO(object): """ Instances of this class represent HDFS file objects. Objects from this class should not be instantiated directly. To open an HDFS file, use :meth:`~.fs.hdfs.open_file`, or the top-level ``open`` function in the hdfs package. """ ENCODING = "utf-8" ERRORS = "strict" def __init__(self, raw_hdfs_file, fs, mode, encoding=None, errors=None): self.mode = mode self.base_mode, is_text = common.parse_mode(self.mode) self.buff_size = raw_hdfs_file.buff_size if self.buff_size <= 0: self.buff_size = common.BUFSIZE if is_text: self.__encoding = encoding or self.__class__.ENCODING self.__errors = errors or self.__class__.ERRORS try: codecs.lookup(self.__encoding) codecs.lookup_error(self.__errors) except LookupError as e: raise ValueError(e) else: if encoding: raise ValueError( "binary mode doesn't take an encoding argument") if errors: raise ValueError("binary mode doesn't take an errors argument") self.__encoding = self.__errors = None cls = io.BufferedReader if self.base_mode == "r" else io.BufferedWriter self.f = cls(raw_hdfs_file, buffer_size=self.buff_size) self.__fs = fs info = fs.get_path_info(self.f.raw.name) self.__name = info["name"] self.__size = info["size"] self.closed = False def __enter__(self): return self def __exit__(self, exc_type, exc_value, traceback): self.close() @property def fs(self): """ The file's hdfs instance. """ return self.__fs @property def name(self): """ The file's fully qualified name. """ return self.__name @property def size(self): """ The file's size in bytes. This attribute is initialized when the file is opened and updated when it is closed. """ return self.__size def writable(self): return self.f.raw.writable() def readline(self): """ Read and return a line of text. 
:rtype: str :return: the next line of text in the file, including the newline character """ _complain_ifclosed(self.closed) line = self.f.readline() if self.__encoding: return line.decode(self.__encoding, self.__errors) else: return line def next(self): """ Return the next input line, or raise :class:`StopIteration` when EOF is hit. """ return self.__next__() def __next__(self): """ Return the next input line, or raise :class:`StopIteration` when EOF is hit. """ _complain_ifclosed(self.closed) line = self.readline() if not line: raise StopIteration return line def __iter__(self): return self def available(self): """ Number of bytes that can be read from this input stream without blocking. :rtype: int :return: available bytes """ _complain_ifclosed(self.closed) return self.f.raw.available() def close(self): """ Close the file. """ if not self.closed: self.closed = True retval = self.f.close() if self.base_mode != "r": self.__size = self.fs.get_path_info(self.name)["size"] return retval def pread(self, position, length): r""" Read ``length`` bytes of data from the file, starting from ``position``\ . :type position: int :param position: position from which to read :type length: int :param length: the number of bytes to read :rtype: string :return: the chunk of data read from the file """ _complain_ifclosed(self.closed) if position > self.size: raise IOError("position cannot be past EOF") if length < 0: length = self.size - position data = self.f.raw.pread(position, length) if self.__encoding: return data.decode(self.__encoding, self.__errors) else: return data def read(self, length=-1): """ Read ``length`` bytes from the file. If ``length`` is negative or omitted, read all data until EOF. :type length: int :param length: the number of bytes to read :rtype: string :return: the chunk of data read from the file """ _complain_ifclosed(self.closed) # NOTE: libhdfs read stops at block boundaries: it is *essential* # to ensure that we actually read the required number of bytes. if length < 0: length = self.size chunks = [] while 1: if length <= 0: break c = self.f.read(min(self.buff_size, length)) if c == b"": break chunks.append(c) length -= len(c) data = b"".join(chunks) if self.__encoding: return data.decode(self.__encoding, self.__errors) else: return data def seek(self, position, whence=os.SEEK_SET): """ Seek to ``position`` in file. :type position: int :param position: offset in bytes to seek to :type whence: int :param whence: defaults to ``os.SEEK_SET`` (absolute); other values are ``os.SEEK_CUR`` (relative to the current position) and ``os.SEEK_END`` (relative to the file's end). """ _complain_ifclosed(self.closed) return self.f.seek(position, whence) def tell(self): """ Get the current byte offset in the file. :rtype: int :return: current offset in bytes """ _complain_ifclosed(self.closed) return self.f.tell() def write(self, data): """ Write ``data`` to the file. :type data: bytes :param data: the data to be written to the file :rtype: int :return: the number of bytes written """ _complain_ifclosed(self.closed) if self.__encoding: self.f.write(data.encode(self.__encoding, self.__errors)) return len(data) else: return self.f.write(data) def flush(self): """ Force any buffered output to be written. """ _complain_ifclosed(self.closed) return self.f.flush() class hdfs_file(FileIO): def pread_chunk(self, position, chunk): r""" Works like :meth:`pread`\ , but data is stored in the writable buffer ``chunk`` rather than returned. Reads at most a number of bytes equal to the size of ``chunk``\ . 
:type position: int :param position: position from which to read :type chunk: buffer :param chunk: a writable object that supports the buffer protocol :rtype: int :return: the number of bytes read """ _complain_ifclosed(self.closed) if position > self.size: raise IOError("position cannot be past EOF") return self.f.raw.pread_chunk(position, chunk) def read_chunk(self, chunk): r""" Works like :meth:`read`\ , but data is stored in the writable buffer ``chunk`` rather than returned. Reads at most a number of bytes equal to the size of ``chunk``\ . :type chunk: buffer :param chunk: a writable object that supports the buffer protocol :rtype: int :return: the number of bytes read """ _complain_ifclosed(self.closed) return self.f.readinto(chunk) class local_file(io.FileIO): """\ Support class to handle local files. Object of this type have the same interface as :class:`FileIO` (and should also be obtained via higher level methods rather than instantiated directly), but act as handles to local files. """ def __init__(self, fs, name, mode): if not mode.startswith("r"): local_file.__make_parents(fs, name) super(local_file, self).__init__(name, mode) name = os.path.abspath(name) self.__fs = fs self.__size = os.fstat(super(local_file, self).fileno()).st_size self.f = self self.buff_size = io.DEFAULT_BUFFER_SIZE @staticmethod def __make_parents(fs, name): d = os.path.dirname(name) if d: try: fs.create_directory(d) except IOError: raise IOError("Cannot open file %s" % name) @property def fs(self): return self.__fs @property def size(self): return self.__size def available(self): _complain_ifclosed(self.closed) return self.size def close(self): if self.writable(): self.flush() os.fsync(self.fileno()) self.__size = os.fstat(self.fileno()).st_size super(local_file, self).close() def seek(self, position, whence=os.SEEK_SET): if position > self.__size: raise IOError("position cannot be past EOF") return super(local_file, self).seek(position, whence) def __seek_and_read(self, position, length=None, buf=None): assert (length is None) != (buf is None) _complain_ifclosed(self.closed) old_pos = self.tell() self.seek(position) if buf is not None: ret = self.readinto(buf) else: if length < 0: length = self.size - position ret = self.read(length) self.seek(old_pos) return ret def pread(self, position, length): return self.__seek_and_read(position, length=length) def pread_chunk(self, position, chunk): return self.__seek_and_read(position, buf=chunk) def read_chunk(self, chunk): _complain_ifclosed(self.closed) return self.readinto(chunk) class TextIOWrapper(io.TextIOWrapper): def __getattr__(self, name): # there is no readinto method in text mode (strings are immutable) if name.endswith("_chunk"): raise AttributeError("%r object has no attribute %r" % ( self.__class__.__name__, name )) a = getattr(self.buffer.raw, name) if name == "mode": a = "%st" % self.buffer.raw.mode[0] return a def pread(self, position, length): data = self.buffer.raw.pread(position, length) return data.decode(self.encoding, self.errors) ================================================ FILE: pydoop/hdfs/fs.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. 
You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT """ pydoop.hdfs.fs -- File System Handles ------------------------------------- """ import os import socket import getpass import re import operator as ops import io import pydoop from . import common from .file import FileIO, hdfs_file, local_file, TextIOWrapper from .core import core_hdfs_fs # py3 compatibility from functools import reduce try: from urllib.parse import urlparse except ImportError: from urlparse import urlparse class _FSStatus(object): def __init__(self, fs, host, port, user, refcount=1): self.fs = fs self.host = host self.port = port self.user = user self.refcount = refcount def __repr__(self): return "_FSStatus(%s, %s)" % (self.fs, self.refcount) def _complain_ifclosed(closed): if closed: raise ValueError("I/O operation on closed HDFS instance") def _get_ip(host, default=None): try: ip = socket.gethostbyname(host) except socket.gaierror: ip = "0.0.0.0" # same as socket.gethostbyname("") return ip if ip != "0.0.0.0" else default def _get_connection_info(host, port, user): fs = core_hdfs_fs(host, port, user) res = urlparse(fs.get_working_directory()) if not res.scheme or res.scheme == "file": h, p, u = "", 0, getpass.getuser() fs.set_working_directory(os.getcwd()) # libhdfs "remembers" old cwd else: try: h, p = res.netloc.split(":") except ValueError: h, p = res.netloc, common.DEFAULT_PORT # try to find an IP address if we can't extract it from res.netloc if not res.netloc: hosts = fs.get_hosts(str(res.path), 0, 0) if hosts and hosts[0] and hosts[0][0]: h, p = hosts[0][0], common.DEFAULT_PORT u = res.path.split("/", 2)[2] return h, int(p), u, fs def _default_fs(): params = pydoop.hadoop_params() _fs = params.get("fs.defaultFS", params.get("fs.default.name", "file:///")) return urlparse(_fs) def default_is_local(): """\ Is Hadoop configured to use the local file system? By default, it is. A DFS must be explicitly configured. """ _fs = _default_fs() return _fs.scheme == "file" class hdfs(object): """ A handle to an HDFS instance. :type host: str :param host: hostname or IP address of the HDFS NameNode. Set to an empty string (and ``port`` to 0) to connect to the local file system; set to ``'default'`` (and ``port`` to 0) to connect to the default (i.e., the one defined in the Hadoop configuration files) file system. :type port: int :param port: the port on which the NameNode is listening :type user: str :param user: the Hadoop domain user name. Defaults to the current UNIX user. Note that, in MapReduce applications, since tasks are spawned by the JobTracker, the default user will be the one that started the JobTracker itself. :type groups: list :param groups: ignored. Included for backwards compatibility. **Note:** when connecting to the local file system, ``user`` is ignored (i.e., it will always be the current UNIX user). 
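A minimal (illustrative) session:

.. code-block:: python

    >>> import pydoop.hdfs as hdfs
    >>> with hdfs.hdfs() as fs:  # connect to the default file system
    ...     fs.create_directory("foo")
    ...     fs.exists("foo")
    True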
""" _CACHE = {} _ALIASES = {"host": {}, "port": {}, "user": {}} def __canonize_hpu(self, hpu): host, port, user = hpu host = self._ALIASES["host"].get(host, host) port = self._ALIASES["port"].get(port, port) user = self._ALIASES["user"].get(user, user) return host, port, user def __lookup(self, hpu): if hpu[0]: hpu = self.__canonize_hpu(hpu) return self._CACHE[hpu] def __eq__(self, other): """ :obj:`True` if ``self`` and ``other`` wrap the same Hadoop file system instance """ return type(self) == type(other) and self.fs == other.fs def __init__(self, host="default", port=0, user=None, groups=None): host = host.strip() raw_host = host host = common.encode_host(host) if user is None: user = "" if not host: port = 0 user = user or getpass.getuser() try: self.__status = self.__lookup((host, port, user)) except KeyError: h, p, u, fs = _get_connection_info(host, port, user) aliasing_info = [] if user else [("user", u, user)] if h != "": aliasing_info.append(("port", p, port)) ip = _get_ip(h, None) if ip: aliasing_info.append(("host", ip, h)) else: ip = h aliasing_info.append(("host", ip, host)) if raw_host != host: aliasing_info.append(("host", ip, raw_host)) for k, true_x, x in aliasing_info: if true_x != x: self._ALIASES[k][x] = true_x try: self.__status = self.__lookup((h, p, u)) except KeyError: self.__status = _FSStatus(fs, h, p, u, refcount=0) self._CACHE[(ip, p, u)] = self.__status self.__status.refcount += 1 def __enter__(self): return self def __exit__(self, exc_type, exc_value, traceback): self.close() @property def fs(self): return self.__status.fs @property def refcount(self): return self.__status.refcount @property def host(self): """ The actual hdfs hostname (empty string for the local fs). """ return self.__status.host @property def port(self): """ The actual hdfs port (0 for the local fs). """ return self.__status.port @property def user(self): """ The user associated with this HDFS connection. """ return self.__status.user def close(self): """ Close the HDFS handle (disconnect). """ self.__status.refcount -= 1 if self.refcount == 0: self.fs.close() for k, status in list(self._CACHE.items()): # yes, we want a copy if status.refcount == 0: del self._CACHE[k] @property def closed(self): return self.__status.refcount == 0 def open_file(self, path, mode="r", buff_size=0, replication=0, blocksize=0, encoding=None, errors=None): """ Open an HDFS file. Supported opening modes are "r", "w", "a". In addition, a trailing "t" can be added to specify text mode (e.g., "rt" = open for reading text). Pass 0 as ``buff_size``, ``replication`` or ``blocksize`` if you want to use the "configured" values, i.e., the ones set in the Hadoop configuration files. 
:type path: str :param path: the full path to the file :type mode: str :param mode: opening mode :type buff_size: int :param buff_size: read/write buffer size in bytes :type replication: int :param replication: HDFS block replication :type blocksize: int :param blocksize: HDFS block size :rtype: :class:`~.file.hdfs_file` :return: handle to the open file """ _complain_ifclosed(self.closed) if not path: raise ValueError("Empty path") m, is_text = common.parse_mode(mode) if not self.host: fret = local_file(self, path, m) if is_text: cls = io.BufferedReader if m == "r" else io.BufferedWriter fret = TextIOWrapper(cls(fret), encoding, errors) return fret f = self.fs.open_file(path, m, buff_size, replication, blocksize) cls = FileIO if is_text else hdfs_file fret = cls(f, self, mode) return fret def capacity(self): """ Return the raw capacity of the filesystem. :rtype: int :return: filesystem capacity """ _complain_ifclosed(self.closed) if not self.__status.host: raise RuntimeError('Capacity is not defined for a local fs') return self.fs.get_capacity() def copy(self, from_path, to_hdfs, to_path): """ Copy file from one filesystem to another. :type from_path: str :param from_path: the path of the source file :type to_hdfs: :class:`hdfs` :param to_hdfs: destination filesystem :type to_path: str :param to_path: the path of the destination file :raises: :exc:`~exceptions.IOError` """ _complain_ifclosed(self.closed) if isinstance(to_hdfs, self.__class__): to_hdfs = to_hdfs.fs return self.fs.copy(from_path, to_hdfs, to_path) def create_directory(self, path): """ Create directory ``path`` (non-existent parents will be created as well). :type path: str :param path: the path of the directory :raises: :exc:`~exceptions.IOError` """ _complain_ifclosed(self.closed) return self.fs.create_directory(path) def default_block_size(self): """ Get the default block size. :rtype: int :return: the default blocksize """ _complain_ifclosed(self.closed) return self.fs.get_default_block_size() def delete(self, path, recursive=True): """ Delete ``path``. :type path: str :param path: the path of the file or directory :type recursive: bool :param recursive: if ``path`` is a directory, delete it recursively when :obj:`True` :raises: :exc:`~exceptions.IOError` when ``recursive`` is :obj:`False` and directory is non-empty """ _complain_ifclosed(self.closed) return self.fs.delete(path, recursive) def exists(self, path): """ Check if a given path exists on the filesystem. :type path: str :param path: the path to look for :rtype: bool :return: :obj:`True` if ``path`` exists """ _complain_ifclosed(self.closed) return self.fs.exists(path) def get_hosts(self, path, start, length): """ Get hostnames where a particular block (determined by ``start`` and ``length``) of a file is stored. Due to replication, a single block could be present on multiple hosts. :type path: str :param path: the path of the file :type start: int :param start: the start of the block :type length: int :param length: the length of the block :rtype: list :return: list of hosts that store the block """ _complain_ifclosed(self.closed) return self.fs.get_hosts(path, start, length) def get_path_info(self, path): """ Get information about ``path`` as a dict of properties.
The return value, based upon ``fs.FileStatus`` from the Java API, has the following fields: * ``block_size``: HDFS block size of ``path`` * ``group``: group associated with ``path`` * ``kind``: ``'file'`` or ``'directory'`` * ``last_access``: last access time of ``path`` * ``last_mod``: last modification time of ``path`` * ``name``: fully qualified path name * ``owner``: owner of ``path`` * ``permissions``: file system permissions associated with ``path`` * ``replication``: replication factor of ``path`` * ``size``: size in bytes of ``path`` :type path: str :param path: a path in the filesystem :rtype: dict :return: path information :raises: :exc:`~exceptions.IOError` """ _complain_ifclosed(self.closed) return self.fs.get_path_info(path) def list_directory(self, path): r""" Get list of files and directories for ``path``\ . :type path: str :param path: the path of the directory :rtype: list :return: list of files and directories in ``path`` :raises: :exc:`~exceptions.IOError` """ _complain_ifclosed(self.closed) return self.fs.list_directory(path) def move(self, from_path, to_hdfs, to_path): """ Move file from one filesystem to another. :type from_path: str :param from_path: the path of the source file :type to_hdfs: :class:`hdfs` :param to_hdfs: destination filesystem :type to_path: str :param to_path: the path of the destination file :raises: :exc:`~exceptions.IOError` """ _complain_ifclosed(self.closed) if isinstance(to_hdfs, self.__class__): to_hdfs = to_hdfs.fs return self.fs.move(from_path, to_hdfs, to_path) def rename(self, from_path, to_path): """ Rename file. :type from_path: str :param from_path: the path of the source file :type to_path: str :param to_path: the path of the destination file :raises: :exc:`~exceptions.IOError` """ _complain_ifclosed(self.closed) return self.fs.rename(from_path, to_path) def set_replication(self, path, replication): r""" Set the replication of ``path`` to ``replication``\ . :type path: str :param path: the path of the file :type replication: int :param replication: the replication value :raises: :exc:`~exceptions.IOError` """ _complain_ifclosed(self.closed) return self.fs.set_replication(path, replication) def set_working_directory(self, path): r""" Set the working directory to ``path``\ . All relative paths will be resolved relative to it. :type path: str :param path: the path of the directory :raises: :exc:`~exceptions.IOError` """ _complain_ifclosed(self.closed) return self.fs.set_working_directory(path) def used(self): """ Return the total raw size of all files in the filesystem. :rtype: int :return: total size of files in the file system """ _complain_ifclosed(self.closed) return self.fs.get_used() def working_directory(self): """ Get the current working directory. :rtype: str :return: current working directory """ _complain_ifclosed(self.closed) wd = self.fs.get_working_directory() return wd def chown(self, path, user='', group=''): """ Change file owner and group. :type path: str :param path: the path to the file or directory :type user: str :param user: Hadoop username. Set to '' if only setting group :type group: str :param group: Hadoop group name. Set to '' if only setting user :raises: :exc:`~exceptions.IOError` """ _complain_ifclosed(self.closed) return self.fs.chown(path, user, group) @staticmethod def __get_umask(): current_umask = os.umask(0) os.umask(current_umask) return current_umask def __compute_mode_from_string(self, path, mode_string): """ Scan a unix-style mode string and apply it to ``path``.
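For instance (illustrative; the public entry point is :meth:`chmod`, which falls back to this method when ``mode`` is a string):

.. code-block:: python

    >>> fs.chmod("f.txt", "g+rw")  # add read/write permissions for the group
    >>> fs.chmod("f.txt", 0o644)   # numeric modes are passed through directly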
:type mode_string: str :param mode_string: see ``man chmod`` for details. ``X``, ``s`` and ``t`` modes are not supported. The string should match the following regular expression: ``[ugoa]*[-+=]([rwx]*)``. :rtype: int :return: a new mode integer resulting from applying ``mode_string`` to ``path``. :raises: :exc:`~exceptions.ValueError` if ``mode_string`` is invalid. """ Char_to_perm_byte = {'r': 4, 'w': 2, 'x': 1} Fields = (('u', 6), ('g', 3), ('o', 0)) # -- m = re.match(r"\s*([ugoa]*)([-+=])([rwx]*)\s*", mode_string) if not m: raise ValueError("Invalid mode string %s" % mode_string) who = m.group(1) what_op = m.group(2) which_perm = m.group(3) # -- old_mode = self.fs.get_path_info(path)['permissions'] # The mode to be applied by the operation, repeated three # times in a list, for user, group, and other respectively. # Initially these are identical, but some may change if we # have to respect the umask setting. op_perm = [ reduce(ops.ior, [Char_to_perm_byte[c] for c in which_perm]) ] * 3 if 'a' in who: who = 'ugo' elif who == '': who = 'ugo' # erase the umask bits inverted_umask = ~self.__get_umask() for i, field in enumerate(Fields): op_perm[i] &= (inverted_umask >> field[1]) & 0x7 # for each user, compute the permission bit and set it in the mode new_mode = 0 for i, tpl in enumerate(Fields): field, shift = tpl # shift by the bits specified for the field; keep only the # 3 lowest bits old = (old_mode >> shift) & 0x7 if field in who: if what_op == '-': new = old & ~op_perm[i] elif what_op == '=': new = op_perm[i] elif what_op == '+': new = old | op_perm[i] else: raise RuntimeError( "unexpected permission operation %s" % what_op ) else: # copy the previous permissions new = old new_mode |= new << shift return new_mode def chmod(self, path, mode): """ Change file mode bits. :type path: str :param path: the path to the file or directory :type mode: int :param mode: the bitmask to set it to (e.g., 0777) :raises: :exc:`~exceptions.IOError` """ _complain_ifclosed(self.closed) try: return self.fs.chmod(path, mode) except TypeError: mode = self.__compute_mode_from_string(path, mode) return self.fs.chmod(path, mode) def utime(self, path, mtime, atime): """ Change file last access and modification times. :type path: str :param path: the path to the file or directory :type mtime: int :param mtime: new modification time in seconds :type atime: int :param atime: new access time in seconds :raises: :exc:`~exceptions.IOError` """ _complain_ifclosed(self.closed) return self.fs.utime(path, int(mtime), int(atime)) def walk(self, top): """ Generate infos for all paths in the tree rooted at ``top`` (included). The ``top`` parameter can be either an HDFS path string or a dictionary of properties as returned by :meth:`get_path_info`. :type top: str, dict :param top: an HDFS path or path info dict :rtype: iterator :return: path infos of files and directories in the tree rooted at ``top`` :raises: :exc:`~exceptions.IOError`; :exc:`~exceptions.ValueError` if ``top`` is empty """ if not top: raise ValueError("Empty path") if not isinstance(top, dict): top = self.get_path_info(top) yield top if top['kind'] == 'directory': for info in self.list_directory(top['name']): for item in self.walk(info): yield item ================================================ FILE: pydoop/hdfs/path.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. 
You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT """ pydoop.hdfs.path -- Path Name Manipulations ------------------------------------------- """ import os import re import time from . import common, fs as hdfs_fs from pydoop.utils.py3compat import clong curdir, pardir, sep = '.', '..', '/' # pylint: disable=C0103 class StatResult(object): """ Mimics the object type returned by :func:`os.stat`. Objects of this class are instantiated from dictionaries with the same structure as the ones returned by :meth:`~.fs.hdfs.get_path_info`. Attributes starting with ``st_`` have the same meaning as the corresponding ones in the object returned by :func:`os.stat`, although some of them may not make sense for an HDFS path (in this case, their value will be set to 0). In addition, the ``kind``, ``name`` and ``replication`` attributes are available, with the same values as in the input dict. """ def __init__(self, path_info): self.st_mode = path_info['permissions'] self.st_ino = 0 self.st_dev = clong(0) self.st_nlink = 1 self.st_uid = path_info['owner'] self.st_gid = path_info['group'] self.st_size = path_info['size'] self.st_atime = path_info['last_access'] self.st_mtime = path_info['last_mod'] self.st_ctime = 0 # -- self.st_blksize = path_info['block_size'] if self.st_blksize: n, r = divmod(path_info['size'], self.st_blksize) self.st_blocks = n + (r != 0) else: self.st_blocks = 0 # -- self.kind = path_info['kind'] self.name = path_info['name'] self.replication = path_info['replication'] def __repr__(self): names = [_ for _ in dir(self) if _.startswith('st_')] names.extend(['kind', 'name', 'replication']) return '%s(%s)' % ( self.__class__.__name__, ', '.join('%s=%r' % (_, getattr(self, _)) for _ in names) ) class _HdfsPathSplitter(object): PATTERN = re.compile(r"([a-z0-9+.-]+):(.*)") @classmethod def raise_bad_path(cls, hdfs_path, why=None): msg = "'%s' is not a valid HDFS path" % hdfs_path msg += " (%s)" % why if why else "" raise ValueError(msg) @classmethod def parse(cls, hdfs_path): if not hdfs_path: return "", "", "" try: scheme, rest = cls.PATTERN.match(hdfs_path).groups() except AttributeError: scheme, rest = "", hdfs_path if not rest: cls.raise_bad_path(hdfs_path, "no scheme-specific part") if rest.startswith("//") and not rest.startswith("///"): if not scheme: cls.raise_bad_path(hdfs_path, 'null scheme') try: netloc, path = rest[2:].split("/", 1) path = "/%s" % path except ValueError: netloc, path = rest[2:], "" elif scheme and not rest.startswith('/'): cls.raise_bad_path(hdfs_path, "relative path in absolute URI") else: netloc, path = "", rest if path.startswith("/"): path = "/%s" % path.lstrip("/") return scheme, netloc, path @classmethod def unparse(cls, scheme, netloc, path): hdfs_path = [] if scheme: hdfs_path.append('%s:' % scheme.rstrip(':')) if netloc: if not scheme: raise ValueError('netloc provided, but scheme is empty') hdfs_path.append('//%s' % netloc) if hdfs_path and path and not path.startswith('/'): hdfs_path.append('/') hdfs_path.append(path) return ''.join(hdfs_path) @classmethod def split_netloc(cls, netloc): if not netloc: return "default", 0 netloc = netloc.split(":") if len(netloc) > 2: raise 
ValueError("netloc is not well-formed: %r" % (netloc,)) if len(netloc) < 2: return netloc[0], common.DEFAULT_PORT hostname, port = netloc try: port = int(port) except ValueError: raise ValueError( "bad netloc (port must be an integer): %r" % (netloc,) ) return hostname, port @classmethod def split(cls, hdfs_path, user): if not hdfs_path: cls.raise_bad_path(hdfs_path, "empty") scheme, netloc, path = cls.parse(hdfs_path) if not scheme: scheme = "file" if hdfs_fs.default_is_local() else "hdfs" if scheme == "hdfs": if not path: cls.raise_bad_path(hdfs_path, "path part is empty") if ":" in path: cls.raise_bad_path( hdfs_path, "':' not allowed outside netloc part" ) hostname, port = cls.split_netloc(netloc) if not path.startswith("/"): path = "/user/%s/%s" % (user, path) elif scheme == "file": hostname, port, path = "", 0, netloc + path else: cls.raise_bad_path(hdfs_path, "unsupported scheme %r" % scheme) return hostname, port, path def parse(hdfs_path): """ Parse the given path and return its components. :type hdfs_path: str :param hdfs_path: an HDFS path, e.g., ``hdfs://localhost:9000/user/me`` :rtype: tuple :return: scheme, netloc, path """ return _HdfsPathSplitter.parse(hdfs_path) def unparse(scheme, netloc, path): """ Construct a path from its three components (see :func:`parse`). """ return _HdfsPathSplitter.unparse(scheme, netloc, path) def split(hdfs_path, user=None): """ Split ``hdfs_path`` into a (hostname, port, path) tuple. :type hdfs_path: str :param hdfs_path: an HDFS path, e.g., ``hdfs://localhost:9000/user/me`` :type user: str :param user: user name used to resolve relative paths, defaults to the current user :rtype: tuple :return: hostname, port, path """ # Use a helper class to compile PATTERN once and for all return _HdfsPathSplitter.split(hdfs_path, user or common.DEFAULT_USER) def join(*parts): """ Join path name components, inserting ``/`` as needed. If any component is an absolute path (see :func:`isabs`), all previous components will be discarded. However, full URIs (see :func:`isfull`) take precedence over incomplete ones: .. code-block:: python >>> import pydoop.hdfs.path as hpath >>> hpath.join('bar', '/foo') '/foo' >>> hpath.join('hdfs://host:1/', '/foo') 'hdfs://host:1/foo' Note that this is *not* the reverse of :func:`split`, but rather a specialized version of :func:`os.path.join`. No check is made to determine whether the returned string is a valid HDFS path. """ try: path = [parts[0]] except IndexError: raise TypeError("need at least one argument") for p in parts[1:]: path[-1] = path[-1].rstrip("/") full = isfull(path[0]) if isfull(p) or (isabs(p) and not full): path = [p] else: path.append(p.lstrip('/')) return "/".join(path) def abspath(hdfs_path, user=None, local=False): """ Return an absolute path for ``hdfs_path``. The ``user`` arg is passed to :func:`split`. The ``local`` argument forces ``hdfs_path`` to be interpreted as an ordinary local path: .. code-block:: python >>> import os >>> os.chdir('/tmp') >>> import pydoop.hdfs.path as hpath >>> hpath.abspath('file:/tmp') 'file:/tmp' >>> hpath.abspath('file:/tmp', local=True) 'file:/tmp/file:/tmp' Note that this function always returns a full URI: ..
code-block:: python >>> import pydoop.hdfs.path as hpath >>> hpath.abspath('/tmp') 'hdfs://localhost:9000/tmp' """ if local: return 'file:%s' % os.path.abspath(hdfs_path) if isfull(hdfs_path): return hdfs_path hostname, port, path = split(hdfs_path, user=user) if hostname: fs = hdfs_fs.hdfs(hostname, port) apath = join("hdfs://%s:%s" % (fs.host, fs.port), path) fs.close() else: apath = "file:%s" % os.path.abspath(path) return apath def splitpath(hdfs_path): """ Split ``hdfs_path`` into a (``head``, ``tail``) pair, according to the same rules as :func:`os.path.split`. """ return (dirname(hdfs_path), basename(hdfs_path)) def basename(hdfs_path): """ Return the final component of ``hdfs_path``. """ return os.path.basename(hdfs_path) def dirname(hdfs_path): """ Return the directory component of ``hdfs_path``. """ scheme, netloc, path = parse(hdfs_path) return unparse(scheme, netloc, os.path.dirname(path)) def exists(hdfs_path, user=None): """ Return :obj:`True` if ``hdfs_path`` exists in the default HDFS. """ hostname, port, path = split(hdfs_path, user=user) fs = hdfs_fs.hdfs(hostname, port) retval = fs.exists(path) fs.close() return retval # -- libhdfs does not support fs.FileStatus.isSymlink() -- def lstat(hdfs_path, user=None): return stat(hdfs_path, user=user) def lexists(hdfs_path, user=None): return exists(hdfs_path, user=user) # -------------------------------------------------------- def kind(path, user=None): """ Get the kind of item ("file" or "directory") that the path references. Return :obj:`None` if ``path`` doesn't exist. """ hostname, port, path = split(path, user=user) fs = hdfs_fs.hdfs(hostname, port) try: return fs.get_path_info(path)['kind'] except IOError: return None finally: fs.close() def isdir(path, user=None): """ Return :obj:`True` if ``path`` refers to a directory. """ return kind(path, user) == 'directory' def isfile(path, user=None): """ Return :obj:`True` if ``path`` refers to a file. """ return kind(path, user) == 'file' def expanduser(path): """ Replace initial ``~`` or ``~user`` with the user's home directory. **NOTE:** if the default file system is HDFS, the ``~user`` form is expanded regardless of the user's existence. """ if hdfs_fs.default_is_local(): return os.path.expanduser(path) m = re.match(r'^~([^/]*)', path) if m is None: return path user = m.groups()[0] or common.DEFAULT_USER return '/user/%s%s' % (user, path[m.end(1):]) def expandvars(path): """ Expand environment variables in ``path``. """ return os.path.expandvars(path) def _update_stat(st, path_): try: os_st = os.stat(path_) except OSError: pass else: for name in dir(os_st): if name.startswith('st_'): setattr(st, name, getattr(os_st, name)) def stat(path, user=None): """ Performs the equivalent of :func:`os.stat` on ``path``, returning a :class:`StatResult` object. """ host, port, path_ = split(path, user) fs = hdfs_fs.hdfs(host, port, user) retval = StatResult(fs.get_path_info(path_)) if not host: _update_stat(retval, path_) fs.close() return retval def getatime(path, user=None): """ Get time of last access of ``path``. """ return stat(path, user=user).st_atime def getmtime(path, user=None): """ Get time of last modification of ``path``. """ return stat(path, user=user).st_mtime def getctime(path, user=None): """ Get time of creation / last metadata change of ``path``. """ return stat(path, user=user).st_ctime def getsize(path, user=None): """ Get size, in bytes, of ``path``. 
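For instance (illustrative; by definition, the result is the ``st_size`` of the corresponding :func:`stat` call):

.. code-block:: python

    >>> import pydoop.hdfs.path as hpath
    >>> hpath.getsize("f.txt") == hpath.stat("f.txt").st_size
    True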
""" return stat(path, user=user).st_size def isfull(path): """ Return :obj:`True` if ``path`` is a full URI (starts with a scheme followed by a colon). No check is made to determine whether ``path`` is a valid HDFS path. """ return bool(_HdfsPathSplitter.PATTERN.match(path)) def isabs(path): """ Return :obj:`True` if ``path`` is absolute. A path is absolute if it is a full URI (see :func:`isfull`) or starts with a forward slash. No check is made to determine whether ``path`` is a valid HDFS path. """ return isfull(path) or path.startswith('/') def islink(path, user=None): """ Return :obj:`True` if ``path`` is a symbolic link. Currently this function always returns :obj:`False` for non-local paths. """ host, _, path_ = split(path, user) if host: return False # libhdfs does not support fs.FileStatus.isSymlink() return os.path.islink(path_) def ismount(path): """ Return :obj:`True` if ``path`` is a mount point. This function always returns :obj:`False` for non-local paths. """ host, _, path_ = split(path, None) if host: return False return os.path.ismount(path_) def normcase(path): return path # we only support Linux / OS X def normpath(path): """ Normalize ``path``, collapsing redundant separators and up-level refs. """ scheme, netloc, path_ = parse(path) return unparse(scheme, netloc, os.path.normpath(path_)) def realpath(path): """ Return ``path`` with symlinks resolved. Currently this function returns non-local paths unchanged. """ scheme, netloc, path_ = parse(path) if scheme == 'file' or hdfs_fs.default_is_local(): return unparse(scheme, netloc, os.path.realpath(path_)) return path def samefile(path1, path2, user=None): """ Return :obj:`True` if both path arguments refer to the same path. """ def tr(p): return abspath(normpath(realpath(p)), user=user) return tr(path1) == tr(path2) def splitdrive(path): return '', path # we only support Linux / OS X def splitext(path): """ Same as :func:`os.path.splitext`. """ return os.path.splitext(path) def access(path, mode, user=None): """ Perform the equivalent of :func:`os.access` on ``path``. """ scheme = parse(path)[0] if scheme == 'file' or hdfs_fs.default_is_local(): return os.access(path, mode) if user is None: user = common.DEFAULT_USER st = stat(path) if st.st_uid == user: mode <<= 6 else: try: groups = common.get_groups(user) except KeyError: # user isn't recognized on the system. No group # information available groups = [] if st.st_gid in groups: mode <<= 3 return (st.st_mode & mode) == mode def utime(hdfs_path, times=None, user=None): atime, mtime = times or 2 * (time.time(),) hostname, port, path = split(hdfs_path, user=user) with hdfs_fs.hdfs(hostname, port) as fs: fs.utime(path, mtime, atime) ================================================ FILE: pydoop/jc.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT """ Provides a wrapper for the JobConf object. 
""" def jc_wrapper(obj): """ Backward compatibility function to support pydoop 0.* applications """ return obj ================================================ FILE: pydoop/mapreduce/__init__.py ================================================ ================================================ FILE: pydoop/mapreduce/api.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT """\ This module provides the base abstract classes used to develop MapReduce application components (:class:`Mapper`, :class:`Reducer`, etc.). """ import json from abc import abstractmethod from collections import namedtuple from pydoop.utils.py3compat import ABC # move to pydoop.properties? AVRO_IO_MODES = {'k', 'v', 'kv', 'K', 'V', 'KV'} class JobConf(dict): """ Configuration properties assigned to this job. JobConf objects are instantiated by the framework and support the same interface as dictionaries, plus a few methods that perform automatic type conversion:: >>> jc['a'] '1' >>> jc.get_int('a') 1 """ def get_int(self, key, default=None): """ Same as :meth:`dict.get`, but the value is converted to an int. """ value = self.get(key, default) return None if value is None else int(value) def get_float(self, key, default=None): """ Same as :meth:`dict.get`, but the value is converted to a float. """ value = self.get(key, default) return None if value is None else float(value) def get_bool(self, key, default=None): """ Same as :meth:`dict.get`, but the value is converted to a bool. The boolean value is considered, respectively, :obj:`True` or :obj:`False` if the string is equal, ignoring case, to ``'true'`` or ``'false'``. """ v = self.get(key, default) if v != default: v = v.strip().lower() if v == 'true': v = True elif v == 'false': v = False elif default is None: raise RuntimeError("invalid bool string: %s" % v) else: v = default return v def get_json(self, key, default=None): value = self.get(key, default) return None if value is None else json.loads(value) class InputSplit(object): """\ Represents a subset of the input data assigned to a single map task. ``InputSplit`` objects are created by the framework and made available to the user application via the ``input_split`` context attribute. """ pass class FileSplit(InputSplit, namedtuple("FileSplit", "filename, offset, length")): """\ A subset (described by offset and length) of an input file. """ pass class OpaqueSplit(InputSplit, namedtuple("OpaqueSplit", "payload")): """\ A wrapper for an arbitrary Python object. Opaque splits are created on the Python side before job submission, serialized as ``hadoop.io.Writable`` objects and stored in an HDFS file. The Java submitter reads the splits from the above file and forwards them to the Python tasks. .. note:: Opaque splits are only available when running a job via ``pydoop submit``. The HDFS path where splits are stored is specified via the ``pydoop.mapreduce.pipes.externalsplits.uri`` configuration key. 
""" pass class Context(ABC): """ Context objects are used for communication between the framework and the Mapreduce application. These objects are instantiated by the framework and passed to user methods as parameters:: class Mapper(api.Mapper): def map(self, context): key, value = context.key, context.value ... context.emit(new_key, new_value) """ @property def input_split(self): """\ The :class:`InputSplit` for this task (map tasks only). This tries to deserialize the raw split sent from upstream. In the most common scenario (file-based input format), the returned value will be a :class:`FileSplit`. To get the raw split, call :meth:`get_input_split` with ``raw=True``. """ return self.get_input_split() @abstractmethod def get_input_split(self, raw=False): pass @property def job_conf(self): """ MapReduce job configuration as a :class:`JobConf` object. """ return self.get_job_conf() @abstractmethod def get_job_conf(self): pass @property def key(self): """ Input key. """ return self.get_input_key() @abstractmethod def get_input_key(self): pass @property def value(self): """ Input value (map tasks only). """ return self.get_input_value() @abstractmethod def get_input_value(self): pass @property def values(self): """ Iterator over all values for the current key (reduce tasks only). """ return self.get_input_values() @abstractmethod def get_input_values(self): pass @abstractmethod def emit(self, key, value): """ Emit a key, value pair to the framework. """ pass @abstractmethod def progress(self): pass @abstractmethod def set_status(self, status): """ Set the current status. :type status: str :param status: a description of the current status """ pass @abstractmethod def get_counter(self, group, name): """ Get a :class:`Counter` from the framework. :type group: str :param group: counter group name :type name: str :param name: counter name The counter can be updated via :meth:`increment_counter`. """ pass @abstractmethod def increment_counter(self, counter, amount): """ Update a :class:`Counter` by the specified amount. """ pass class Closable(object): def close(self): """ Called after the object has finished its job. Overriding this method is **not** required. """ pass class Component(ABC): def __init__(self, context): self.context = context class Mapper(Component, Closable): """ Maps input key/value pairs to a set of intermediate key/value pairs. """ @abstractmethod def map(self, context): """ Called once for each key/value pair in the input split. Applications must override this, emitting an output key/value pair through the context. :type context: :class:`Context` :param context: the context object passed by the framework, used to get the input key/value pair and emit the output key/value pair. """ pass class Reducer(Component, Closable): """ Reduces a set of intermediate values which share a key to a (possibly) smaller set of values. """ @abstractmethod def reduce(self, context): """ Called once for each key. Applications must override this, emitting an output key/value pair through the context. :type context: :class:`Context` :param context: the context object passed by the framework, used to get the input key and corresponding set of values and emit the output key/value pair. """ pass class Combiner(Reducer): """\ A ``Combiner`` performs the same actions as a :class:`Reducer`, but it runs locally within a map task. This helps cutting down the amount of data sent to reducers across the network, with the downside that map tasks require extra memory to cache intermediate key/value pairs. 
The cache size is controlled by ``"mapreduce.task.io.sort.mb"`` and defaults to 100 MB. Note that it's not strictly necessary to extend this class in order to write a combiner: all that's required is that it has the same interface as a :class:`reducer`. Indeed, in many cases it's useful to set the combiner class to be the same as the reducer class. """ pass class Partitioner(Component): r""" Controls the partitioning of intermediate keys output by the :class:`Mapper`\ . The key (or a subset of it) is used to derive the partition, typically by a hash function. The total number of partitions is the same as the number of reduce tasks for the job. Hence this controls which of the *m* reduce tasks the intermediate key (and hence the record) is sent to for reduction. """ @abstractmethod def partition(self, key, num_of_reduces): r""" Get the partition number for ``key`` given the total number of partitions, i.e., the number of reduce tasks for the job. Applications must override this. :type key: str :param key: the key of the key/value pair being dispatched. :type numOfReduces: int :param numOfReduces: the total number of reduces. :rtype: int :return: the partition number for ``key``\ . """ pass class RecordReader(Component, Closable): r""" Breaks the data into key/value pairs for input to the :class:`Mapper`\ . """ def __iter__(self): return self @abstractmethod def next(self): r""" Called by the framework to provide a key/value pair to the :class:`Mapper`\ . Applications must override this, making sure it raises :exc:`~exceptions.StopIteration` when there are no more records to process. :rtype: tuple :return: a tuple of two elements. They are, respectively, the key and the value (as strings) """ raise StopIteration def __next__(self): return self.next() @abstractmethod def get_progress(self): """ The current progress of the record reader through its data. :rtype: float :return: the fraction of data read up to now, as a float between 0 and 1. """ pass class RecordWriter(Component, Closable): """ Writes the output key/value pairs to an output file. """ @abstractmethod def emit(self, key, value): """ Writes a key/value pair. Applications must override this. :type key: str :param key: a final output key :type value: str :param value: a final output value """ pass class Factory(ABC): """\ Creates MapReduce application components (e.g., mapper, reducer). A factory object must be created by the application and passed to the framework as the first argument to :func:`~.pipes.run_task`. All MapReduce applications need at least a mapper object, while other components are optional (the corresponding ``create_`` method can return :obj:`None`). Note that the reducer is optional only in map-only jobs, where the number of reduce tasks has been set to 0. :class:`~.pipes.Factory` provides a generic implementation that takes component *classes* as initialization parameters and creates component objects as needed. """ @abstractmethod def create_mapper(self, context): pass def create_reducer(self, context): return None def create_combiner(self, context): """ Create a combiner object. Return the new combiner or :obj:`None`, if one is not needed. """ return None def create_partitioner(self, context): """ Create a partitioner object. Return the new partitioner or :obj:`None`, if the default partitioner should be used. """ return None def create_record_reader(self, context): """ Create a record reader object. Return the new record reader or :obj:`None`, if the Java record reader should be used. 
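For instance, a concrete factory might plug in a custom reader as follows (sketch; ``MyReader`` is a hypothetical :class:`RecordReader` subclass)::

    def create_record_reader(self, context):
        return MyReader(context)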
""" return None def create_record_writer(self, context): """ Create an application record writer. Return the new record writer or :obj:`None`, if the Java record writer should be used. """ return None ================================================ FILE: pydoop/mapreduce/binary_protocol.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT """\ Client side of the Hadoop pipes protocol. Ref: ``org.apache.hadoop.mapred.pipes.BinaryProtocol``. """ import os try: from cPickle import loads except ImportError: from pickle import loads from itertools import groupby from operator import itemgetter import pydoop.config as config from .api import AVRO_IO_MODES, JobConf PROTOCOL_VERSION = 0 # We can use an enum.IntEnum after dropping Python2 compatibility START = 0 SET_JOB_CONF = 1 SET_INPUT_TYPES = 2 RUN_MAP = 3 MAP_ITEM = 4 RUN_REDUCE = 5 REDUCE_KEY = 6 REDUCE_VALUE = 7 CLOSE = 8 ABORT = 9 AUTHENTICATION_REQ = 10 OUTPUT = 50 PARTITIONED_OUTPUT = 51 STATUS = 52 PROGRESS = 53 DONE = 54 REGISTER_COUNTER = 55 INCREMENT_COUNTER = 56 AUTHENTICATION_RESP = 57 CMD_REPR = { START: "START", SET_JOB_CONF: "SET_JOB_CONF", SET_INPUT_TYPES: "SET_INPUT_TYPES", RUN_MAP: "RUN_MAP", MAP_ITEM: "MAP_ITEM", RUN_REDUCE: "RUN_REDUCE", REDUCE_KEY: "REDUCE_KEY", REDUCE_VALUE: "REDUCE_VALUE", CLOSE: "CLOSE", ABORT: "ABORT", AUTHENTICATION_REQ: "AUTHENTICATION_REQ", OUTPUT: "OUTPUT", PARTITIONED_OUTPUT: "PARTITIONED_OUTPUT", STATUS: "STATUS", PROGRESS: "PROGRESS", DONE: "DONE", REGISTER_COUNTER: "REGISTER_COUNTER", INCREMENT_COUNTER: "INCREMENT_COUNTER", AUTHENTICATION_RESP: "AUTHENTICATION_RESP", } IS_JAVA_RW = "mapreduce.pipes.isjavarecordwriter" def get_password(): try: pass_fn = os.environ["hadoop.pipes.shared.secret.location"] except KeyError: return None with open(pass_fn, "rb") as f: return f.read() # _get_* functions to patch the downlink according to the chosen # deserialization policy (see below) def _get_LongWritable(downlink): assert downlink.stream.read_vint() == 8 return downlink.stream.read_long_writable() def _get_Text(downlink): return downlink.stream.read_string() DESERIALIZERS = { "org.apache.hadoop.io.LongWritable": _get_LongWritable, "org.apache.hadoop.io.Text": _get_Text, } def _get_avro_key(downlink): raw = downlink.stream.read_bytes() return downlink.avro_key_deserializer.deserialize(raw) def _get_avro_value(downlink): raw = downlink.stream.read_bytes() return downlink.avro_value_deserializer.deserialize(raw) def _get_pickled(downlink): return loads(downlink.stream.read_bytes()) class Downlink(object): """\ Reads and executes pipes commands as directed by upstream. The downlink drives the entire MapReduce task, plugging in user components and calling their methods as necessary. A task can be either a **map** task or a **reduce** task, but this is not known until after a few initial commands, as shown below. 
All tasks start with the following commands:: AUTHENTICATION_REQ START SET_JOB_CONF Map tasks follow up with:: RUN_MAP if java_reader: SET_INPUT_TYPES for k, v in input: MAP_ITEM CLOSE Reduce tasks follow up with:: RUN_REDUCE for k in input: REDUCE_KEY for v in values_for(k): REDUCE_VALUE CLOSE In both cases, the inner loop consists of handling the key/value stream. All the code involved in this process, namely: * reading and optionally deserializing input keys and values * calling user methods * emitting output keys and values back to upstream must be as efficient as possible. For this reason, rather than having the ``get_{k,v}`` methods go through a complex ``if`` tree at every call, we patch the class itself by replacing each method with the one appropriate for the current scenario. Note that we can do this because: * the deserialization policy (including no deserialization) is the same for all items of a given kind (key or value), meaning that an ``if`` tree would pick the same branch for the entire process * there is only one Downlink object in the process, so we don't risk altering the behavior of other instances * the Downlink object is not part of the client API (it's not passed to user code at all) Job conf deserialization also needs to be somewhat efficient, since it involves reading thousands of strings. """ def __init__(self, istream, context, **kwargs): self.stream = istream self.context = context self.raw_k = kwargs.get("raw_keys", False) self.raw_v = kwargs.get("raw_values", False) self.password = get_password() self.auth_done = False self.avro_key_deserializer = None self.avro_value_deserializer = None def close(self): self.stream.close() def read_job_conf(self): n = self.stream.read_vint() if n & 1: raise RuntimeError("number of items is not even") t = self.stream.read_tuple(n * 's') return JobConf(t[i: i + 2] for i in range(0, n, 2)) def verify_digest(self, digest, challenge): if self.password is not None: self.context._authenticate(self.password, digest, challenge) # self.password is None: assume reading from cmd file self.auth_done = True def setup_record_writer(self, piped_output): writer = self.context.create_record_writer() if writer and piped_output: raise RuntimeError("record writer defined when not needed") if not writer and not piped_output: raise RuntimeError("record writer not defined") def get_k(self): return self.stream.read_bytes() def get_v(self): return self.stream.read_bytes() def setup_avro_deser(self): try: from pydoop.avrolib import AvroDeserializer except ImportError as e: raise RuntimeError("cannot handle avro input: %s" % e) jc = self.context.job_conf avro_input = jc.get(config.AVRO_INPUT).upper() if avro_input not in AVRO_IO_MODES: raise RuntimeError('invalid avro input mode: %s' % avro_input) if avro_input == 'K' or avro_input == 'KV' and not self.raw_k: schema = jc.get(config.AVRO_KEY_INPUT_SCHEMA) self.avro_key_deserializer = AvroDeserializer(schema) self.__class__.get_k = _get_avro_key if avro_input == 'V' or avro_input == 'KV' and not self.raw_v: schema = jc.get(config.AVRO_VALUE_INPUT_SCHEMA) self.avro_value_deserializer = AvroDeserializer(schema) self.__class__.get_v = _get_avro_value def setup_deser(self, key_type, value_type): if not self.raw_k: d = DESERIALIZERS.get(key_type) if d is not None: self.__class__.get_k = d if not self.raw_v: d = DESERIALIZERS.get(value_type) if d is not None: self.__class__.get_v = d def __next__(self): cmd = self.stream.read_vint() if cmd != AUTHENTICATION_REQ and not self.auth_done: raise RuntimeError("%d 
received before authentication" % cmd) if cmd == AUTHENTICATION_REQ: digest, challenge = self.stream.read_tuple('bb') self.verify_digest(digest, challenge) elif cmd == START: v = self.stream.read_vint() if (v != PROTOCOL_VERSION): raise RuntimeError("Unknown protocol id: %d" % v) elif cmd == SET_JOB_CONF: self.context._job_conf = self.read_job_conf() if config.AVRO_OUTPUT in self.context.job_conf: self.context._setup_avro_ser() elif cmd == RUN_MAP: self.context.task_type = "m" split, nred, piped_input = self.stream.read_tuple('bii') self.context._raw_split = split reader = self.context.create_record_reader() if reader and piped_input: raise RuntimeError("record reader defined when not needed") if not reader and not piped_input: raise RuntimeError("record reader not defined") combiner = self.context.create_combiner() if nred < 1: # map-only job if combiner: raise RuntimeError("combiner defined in map-only job") self.context._private_encoding = False piped_output = self.context.job_conf.get_bool(IS_JAVA_RW) self.setup_record_writer(piped_output) self.context.nred = nred self.context.create_mapper() self.context.create_partitioner() if reader: for self.context._key, self.context._value in reader: self.context.mapper.map(self.context) self.context.progress_value = reader.get_progress() self.context.progress() # no more commands from upstream, not even CLOSE try: self.context.close() finally: raise StopIteration elif cmd == SET_INPUT_TYPES: key_type, value_type = self.stream.read_tuple('ss') if config.AVRO_INPUT in self.context.job_conf: self.setup_avro_deser() else: self.setup_deser(key_type, value_type) elif cmd == MAP_ITEM: self.context._key = self.get_k() self.context._value = self.get_v() self.context.mapper.map(self.context) elif cmd == RUN_REDUCE: self.context.task_type = "r" part, piped_output = self.stream.read_tuple('ii') # for some reason, part is always 0 self.context.create_reducer() self.setup_record_writer(piped_output) if self.context._private_encoding: self.__class__.get_k = _get_pickled self.__class__.get_v = _get_pickled for cmd, subs in groupby(self, itemgetter(0)): if cmd == REDUCE_KEY: _, self.context._key = next(subs) if cmd == REDUCE_VALUE: self.context._values = (v for _, v in subs) self.context.reducer.reduce(self.context) if cmd == CLOSE: try: self.context.close() finally: raise StopIteration elif cmd == REDUCE_KEY: k = self.get_k() return cmd, k # pass on to RUN_REDUCE iterator elif cmd == REDUCE_VALUE: v = self.get_v() return cmd, v # pass on to RUN_REDUCE iterator elif cmd == ABORT: raise RuntimeError("received ABORT command") elif cmd == CLOSE: if self.context.mapper: try: self.context.close() finally: raise StopIteration else: return cmd, None # pass on to RUN_REDUCE iterator else: raise RuntimeError("unknown command: %d" % cmd) def __iter__(self): return self # py2 compat def next(self): return self.__next__() class Uplink(object): """\ Writes all information that needs to be sent upstream. 
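Each method frames a single upstream command as a command code followed by its arguments (see the codes defined above). For example (sketch)::

    uplink.status("sorting")  # writes (STATUS, "sorting")
    uplink.progress(0.5)      # writes (PROGRESS, 0.5)
    uplink.flush()            # push buffered commands upstream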
""" def __init__(self, stream): self.stream = stream def flush(self): self.stream.flush() def close(self): self.stream.close() # pipes commands def authenticate(self, response_digest): self.stream.write_tuple("ib", (AUTHENTICATION_RESP, response_digest)) def output(self, k, v): self.stream.write_output(k, v) def partitioned_output(self, part, k, v): self.stream.write_output(k, v, part) def status(self, msg): self.stream.write_tuple("is", (STATUS, msg)) def progress(self, p): self.stream.write_tuple("if", (PROGRESS, p)) def done(self): self.stream.write_vint(DONE) def register_counter(self, id, group, name): self.stream.write_tuple("iiss", (REGISTER_COUNTER, id, group, name)) def increment_counter(self, id, amount): self.stream.write_tuple("iil", (INCREMENT_COUNTER, id, amount)) ================================================ FILE: pydoop/mapreduce/connections.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT """\ Set up communication channels with the MapReduce framework. If "mapreduce.pipes.command.port" is in the env, this is a "real" Hadoop task: we have to connect to the given port and use the socket for live communication with the Java submitter. If the above env variable is not defined, but "mapreduce.pipes.commandfile" is, a pre-compiled binary file containing the entire command list from upstream is available at the specified (local) filesystem path. """ import os import socket import pydoop.sercore as sercore from .binary_protocol import Downlink, Uplink class Connection(object): """\ Create up/down links and set up references. The ref chain is ``downlink -> context -> uplink``, where ``downlink -> context`` is an owned ref and ``context -> uplink`` is a borrowed one (owner is responsible for closing, borrower must **not** close). Other refs:: downlink -> istream (owned) uplink -> ostream (owned) connection -> downlink (owned) connection -> uplink (owned) Connection keeps no reference at all to either istream or ostream. 
""" def __init__(self, context, istream, ostream, **kwargs): self.uplink = context.uplink = Uplink(ostream) self.downlink = Downlink(istream, context, **kwargs) def close(self): self.uplink.close() self.downlink.close() def __enter__(self): return self def __exit__(self, *args): self.close() class NetworkConnection(Connection): def __init__(self, context, host, port, **kwargs): self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) self.socket.connect((host, port)) istream = sercore.FileInStream(self.socket) ostream = sercore.FileOutStream(self.socket) super(NetworkConnection, self).__init__( context, istream, ostream, **kwargs ) def close(self): super(NetworkConnection, self).close() self.socket.close() class FileConnection(Connection): def __init__(self, context, in_fn, out_fn, **kwargs): istream = sercore.FileInStream(in_fn) ostream = sercore.FileOutStream(out_fn) super(FileConnection, self).__init__( context, istream, ostream, **kwargs ) def get_connection(context, **kwargs): port = os.getenv("mapreduce.pipes.command.port") if port: return NetworkConnection(context, "localhost", int(port), **kwargs) in_fn = os.getenv("mapreduce.pipes.commandfile") if in_fn: out_fn = "%s.out" % in_fn return FileConnection(context, in_fn, out_fn, **kwargs) raise RuntimeError("no pipes source found") ================================================ FILE: pydoop/mapreduce/pipes.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT """\ Python driver for Hadoop Pipes tasks. The intended usage is to import this module in the executable script passed to ``mapred pipes`` (or ``pydoop submit``) and call ``run_task`` with the appropriate arguments (see the docs and examples for further details). """ import base64 import hashlib import hmac import io import os import struct try: from cPickle import dumps, loads, HIGHEST_PROTOCOL except ImportError: from pickle import dumps, loads, HIGHEST_PROTOCOL from time import time from sys import getsizeof as sizeof import pydoop.config as config import pydoop.sercore as sercore from . 
import api, connections # py2 compat try: as_text = unicode except NameError: as_text = str PSTATS_DIR = "PYDOOP_PSTATS_DIR" PSTATS_FMT = "PYDOOP_PSTATS_FMT" DEFAULT_PSTATS_FMT = "%s_%05d_%s" # task_type, task_id, random suffix INT_WRITABLE_FMT = ">i" INT_WRITABLE_SIZE = struct.calcsize(INT_WRITABLE_FMT) def create_digest(key, msg): h = hmac.new(key, msg, hashlib.sha1) return base64.b64encode(h.digest()) # extra support for java types, not meant for performance-critical sections def read_int_writable(f): buf = f.read(INT_WRITABLE_SIZE) return struct.unpack(INT_WRITABLE_FMT, buf)[0] def write_int_writable(n, f): f.write(struct.pack(INT_WRITABLE_FMT, n)) def read_bytes_writable(f): length = read_int_writable(f) buf = f.read(length) if len(buf) < length: raise RuntimeError("expected %d bytes, found %d" % (length, len(buf))) return buf def write_bytes_writable(s, f): write_int_writable(len(s), f) if len(s) > 0: f.write(s) class FileSplit(api.FileSplit): @classmethod def frombuffer(cls, buf): filename, offset, length = sercore.deserialize_file_split(buf) return cls(filename, offset, length) class OpaqueSplit(api.OpaqueSplit): @classmethod def frombuffer(cls, buf): return cls.read(io.BytesIO(buf)) @classmethod def read(cls, f): return cls(loads(read_bytes_writable(f))) def write(self, f): write_bytes_writable(dumps(self.payload, HIGHEST_PROTOCOL), f) def write_opaque_splits(splits, f): write_int_writable(len(splits), f) for s in splits: s.write(f) def read_opaque_splits(f): n = read_int_writable(f) return [OpaqueSplit.read(f) for _ in range(n)] class TaskContext(api.Context): JOB_OUTPUT_DIR = "mapreduce.output.fileoutputformat.outputdir" TASK_OUTPUT_DIR = "mapreduce.task.output.dir" TASK_PARTITION = "mapreduce.task.partition" def __init__(self, factory, **kwargs): self.factory = factory self.uplink = None self.combiner = None self.mapper = None self.partitioner = None self.record_reader = None self.record_writer = None self.reducer = None self.nred = None self.progress_value = 0.0 self.last_progress_t = 0.0 self.status = None self.counters = {} self.task_type = None self.avro_key_serializer = None self.avro_value_serializer = None self._private_encoding = kwargs.get("private_encoding", True) self._raw_split = None self._input_split = None self._job_conf = {} self._key = None self._value = None self._values = None self.__auto_serialize = kwargs.get("auto_serialize", True) self.__cache = {} self.__cache_size = 0 self.__spill_size = None # delayed until (if) create_combiner self.__spilling = True # enable actual emit def get_input_split(self, raw=False): if raw: return self._raw_split if not self._input_split: if config.PIPES_EXTERNALSPLITS_URI in self._job_conf: self._input_split = OpaqueSplit.frombuffer(self._raw_split) else: self._input_split = FileSplit.frombuffer(self._raw_split) return self._input_split def get_job_conf(self): return self._job_conf def get_input_key(self): return self._key def get_input_value(self): return self._value def get_input_values(self): return self._values def create_combiner(self): self.combiner = self.factory.create_combiner(self) if self.combiner: self.__spill_size = 1024 * 1024 * self.job_conf.get_int( "mapreduce.task.io.sort.mb", 100 ) self.__spilling = False return self.combiner def create_mapper(self): self.mapper = self.factory.create_mapper(self) return self.mapper def create_partitioner(self): self.partitioner = self.factory.create_partitioner(self) return self.partitioner def create_record_reader(self): self.record_reader = 
self.factory.create_record_reader(self) return self.record_reader def create_record_writer(self): self.record_writer = self.factory.create_record_writer(self) return self.record_writer def create_reducer(self): self.reducer = self.factory.create_reducer(self) return self.reducer def progress(self): """\ Report progress to the Java side. This needs to flush the uplink stream, but too many flushes can disrupt performance, so we actually talk to upstream once per second. """ now = time() if now - self.last_progress_t > 1: self.last_progress_t = now if self.status: self.uplink.status(self.status) self.status = None self.__spill_counters() self.uplink.progress(self.progress_value) self.uplink.flush() def set_status(self, status): self.status = status self.progress() def get_counter(self, group, name): id = len(self.counters) self.uplink.register_counter(id, group, name) self.uplink.flush() self.counters[id] = 0 return id def increment_counter(self, counter, amount): try: self.counters[counter] += amount except KeyError: raise ValueError("invalid counter: %r" % (counter,)) def __spill_counters(self): for c, amount in self.counters.items(): if amount: self.uplink.increment_counter(c, amount) self.counters[c] = 0 def _authenticate(self, password, digest, challenge): if create_digest(password, challenge) != digest: raise RuntimeError("server failed to authenticate") response_digest = create_digest(password, digest) self.uplink.authenticate(response_digest) self.uplink.flush() def _setup_avro_ser(self): try: from pydoop.avrolib import AvroSerializer except ImportError as e: raise RuntimeError("cannot handle avro output: %s" % e) jc = self.job_conf avro_output = jc.get(config.AVRO_OUTPUT).upper() if avro_output not in api.AVRO_IO_MODES: raise RuntimeError('invalid avro output mode: %s' % avro_output) if avro_output == 'K' or avro_output == 'KV': schema = jc.get(config.AVRO_KEY_OUTPUT_SCHEMA) self.avro_key_serializer = AvroSerializer(schema) if avro_output == 'V' or avro_output == 'KV': schema = jc.get(config.AVRO_VALUE_OUTPUT_SCHEMA) self.avro_value_serializer = AvroSerializer(schema) def __maybe_serialize(self, key, value): if self.task_type == "m" and self._private_encoding: return dumps(key, HIGHEST_PROTOCOL), dumps(value, HIGHEST_PROTOCOL) if self.avro_key_serializer: key = self.avro_key_serializer.serialize(key) elif self.__auto_serialize: key = as_text(key).encode("utf-8") if self.avro_value_serializer: value = self.avro_value_serializer.serialize(value) elif self.__auto_serialize: value = as_text(value).encode("utf-8") return key, value def emit(self, key, value): """\ Handle an output key/value pair. Reporting progress is strictly necessary only when using a Python record writer, because sending an output key/value pair is an implicit progress report. To take advantage of this, however, we would be forced to flush the uplink stream at every output, and that would be too costly. Rather than add a specific timer for this, we just call progress unconditionally and piggyback on its timer instead. Note that when a combiner is caching there is no actual output, so in that case we would need an explicit progress report anyway. 
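For reference, the combiner cache threshold mentioned above is derived from the job configuration in :meth:`create_combiner` (sketch)::

    mb = self.job_conf.get_int("mapreduce.task.io.sort.mb", 100)
    spill_size = 1024 * 1024 * mb  # bytes: 100 MiB with the default setting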
""" if self.__spilling: self.__actual_emit(key, value) else: # key must be hashable self.__cache.setdefault(key, []).append(value) self.__cache_size += sizeof(key) + sizeof(value) if self.__cache_size >= self.__spill_size: self.__spill_all() self.progress() def __actual_emit(self, key, value): if self.record_writer: self.record_writer.emit(key, value) return key, value = self.__maybe_serialize(key, value) if self.partitioner: part = self.partitioner.partition(key, self.nred) self.uplink.partitioned_output(part, key, value) else: self.uplink.output(key, value) def __spill_all(self): self.__spilling = True for k in sorted(self.__cache): self._key = k self._values = iter(self.__cache[k]) self.combiner.reduce(self) self.__cache.clear() self.__cache_size = 0 self.__spilling = False def close(self): self.uplink.flush() # do *not* call uplink.done while user components are still active try: if self.mapper: self.mapper.close() # handle combiner after mapper (mapper.close can call emit) if self.__cache: self.__spill_all() self.__spilling = True # re-enable emit for combiner.close self.combiner.close() if self.record_reader: self.record_reader.close() if self.record_writer: self.record_writer.close() if self.reducer: self.reducer.close() self.__spill_counters() finally: self.uplink.done() self.uplink.flush() def get_output_dir(self): return self.job_conf[self.JOB_OUTPUT_DIR] def get_work_path(self): try: return self.job_conf[self.TASK_OUTPUT_DIR] except KeyError: raise RuntimeError("%r not set" % (self.TASK_OUTPUT_DIR,)) def get_task_partition(self): return self.job_conf.get_int(self.TASK_PARTITION) def get_default_work_file(self, extension=""): partition = self.get_task_partition() if partition is None: raise RuntimeError("%r not set" % (self.TASK_PARTITION,)) base = self.job_conf.get("mapreduce.output.basename", "part") return "%s/%s-%s-%05d%s" % ( self.get_work_path(), base, self.task_type, partition, extension ) class Factory(api.Factory): def __init__(self, mapper_class, reducer_class=None, combiner_class=None, partitioner_class=None, record_writer_class=None, record_reader_class=None): self.mclass = mapper_class self.rclass = reducer_class self.cclass = combiner_class self.pclass = partitioner_class self.rwclass = record_writer_class self.rrclass = record_reader_class def create_mapper(self, context): return self.mclass(context) def create_reducer(self, context): return None if not self.rclass else self.rclass(context) def create_combiner(self, context): return None if not self.cclass else self.cclass(context) def create_partitioner(self, context): return None if not self.pclass else self.pclass(context) def create_record_reader(self, context): return None if not self.rrclass else self.rrclass(context) def create_record_writer(self, context): return None if not self.rwclass else self.rwclass(context) def _run(context, **kwargs): with connections.get_connection(context, **kwargs) as connection: for _ in connection.downlink: pass def run_task(factory, **kwargs): """\ Run a MapReduce task. 
Available keyword arguments: * ``raw_keys`` (default: :obj:`False`): pass map input keys to context as byte strings (ignore any type information) * ``raw_values`` (default: :obj:`False`): pass map input values to context as byte strings (ignore any type information) * ``private_encoding`` (default: :obj:`True`): automatically serialize map output k/v and deserialize reduce input k/v (pickle) * ``auto_serialize`` (default: :obj:`True`): automatically serialize reduce output (map output in map-only jobs) k/v (call str/unicode then encode as utf-8) Advanced keyword arguments: * ``pstats_dir``: run the task with cProfile and store stats in this dir * ``pstats_fmt``: use this pattern for pstats filenames (experts only) The pstats dir and filename pattern can also be provided via ``pydoop submit`` arguments, with lower precedence in case of clashes. """ context = TaskContext(factory, **kwargs) pstats_dir = kwargs.get("pstats_dir", os.getenv(PSTATS_DIR)) if pstats_dir: import cProfile import tempfile import pydoop.hdfs as hdfs hdfs.mkdir(pstats_dir) fd, pstats_fn = tempfile.mkstemp(suffix=".pstats") os.close(fd) cProfile.runctx( "_run(context, **kwargs)", globals(), locals(), filename=pstats_fn ) pstats_fmt = kwargs.get( "pstats_fmt", os.getenv(PSTATS_FMT, DEFAULT_PSTATS_FMT) ) name = pstats_fmt % ( context.task_type, context.get_task_partition(), os.path.basename(pstats_fn) ) hdfs.put(pstats_fn, hdfs.path.join(pstats_dir, name)) else: _run(context, **kwargs) ================================================ FILE: pydoop/test_support.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT """ Miscellaneous utilities for testing. """ from __future__ import print_function import sys import os import tempfile from pydoop.hdfs import default_is_local from pydoop.utils.py3compat import iteritems def __inject_pos(code, start=0): pos = code.find("import", start) if pos < 0: return start pos = code.rfind(os.linesep, 0, pos) + 1 endpos = code.find(os.linesep, pos) + 1 if "__future__" in code[pos:endpos]: return __inject_pos(code, endpos) else: return pos def inject_code(new_code, target_code): """ Inject new_code into target_code, before the first non-future import. NOTE: this is just a hack to make examples work out-of-the-box, in the general case it can fail in several ways. 
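Example (sketch, assuming ``os.linesep`` is ``'\n'``)::

    target = "from __future__ import print_function\nimport os\n"
    patched = inject_code("import sys", target)
    # "import sys" now sits after the __future__ import and before
    # "import os", wrapped in #--AUTO-INJECTED-- marker comments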
""" new_code = "{0}#--AUTO-INJECTED--{0}{1}{0}#-----------------{0}".format( os.linesep, os.linesep.join(new_code.strip().splitlines()) ) pos = __inject_pos(target_code) return target_code[:pos] + new_code + target_code[pos:] def add_sys_path(target_code): new_code = os.linesep.join([ "import sys", "sys.path = %r" % (sys.path,) ]) return inject_code(new_code, target_code) def set_python_cmd(code, python_cmd=sys.executable): python_cmd = python_cmd.strip() if not python_cmd.startswith(os.sep): python_cmd = os.path.join("", "usr", "bin", "env", python_cmd) if code.startswith("#!"): pos = code.find(os.linesep, 2) code = "" if pos < 0 else code[pos + 1:] return "#!%s%s%s" % (python_cmd, os.linesep, code) def adapt_script(code, python_cmd=sys.executable): return set_python_cmd(add_sys_path(code), python_cmd=python_cmd) def parse_mr_output(output, vtype=str): d = {} for line in output.splitlines(): if line.isspace(): continue try: k, v = line.split() v = vtype(v) except (ValueError, TypeError): raise ValueError("bad output format") if k in d: raise ValueError("duplicate key: %r" % (k,)) d[k] = v return d def compare_counts(c1, c2): if len(c1) != len(c2): print(len(c1), len(c2)) return "number of keys differs" keys = sorted(c1) if sorted(c2) != keys: return "key lists are different" for k in keys: if c1[k] != c2[k]: return "values are different for key %r (%r != %r)" % ( k, c1[k], c2[k] ) class LocalWordCount(object): def __init__(self, input_path, min_occurrence=0, stop_words=None): self.input_path = input_path self.min_occurrence = min_occurrence self.stop_words = frozenset(stop_words or []) self.__expected_output = None @property def expected_output(self): if self.__expected_output is None: self.__expected_output = self.run() return self.__expected_output def run(self): wc = {} if os.path.isdir(self.input_path): for fn in os.listdir(self.input_path): if fn[0] == ".": continue self._wordcount_file(wc, fn, self.input_path) else: self._wordcount_file(wc, self.input_path) if self.min_occurrence: wc = dict(t for t in iteritems(wc) if t[1] >= self.min_occurrence) return wc def _wordcount_file(self, wc, fn, path=None): with open(os.path.join(path, fn) if path else fn) as f: for line in f: for w in line.split(): if w not in self.stop_words: wc[w] = wc.get(w, 0) + 1 def check(self, output): res = compare_counts( parse_mr_output(output, vtype=int), self.expected_output ) if res: return "ERROR: %s" % res else: return "OK." def get_wd_prefix(base="pydoop_"): if default_is_local(): return os.path.join(tempfile.gettempdir(), "pydoop_") else: return base ================================================ FILE: pydoop/test_utils.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT """ Utilities for unit tests. 
""" import sys import os import random import uuid import tempfile import imp import unittest import shutil import warnings import subprocess import pydoop import pydoop.utils.jvm as jvm from pydoop.utils.py3compat import StringIO JAVA_HOME = jvm.get_java_home() JAVA = os.path.join(JAVA_HOME, "bin", "java") JAVAC = os.path.join(JAVA_HOME, "bin", "javac") _RANDOM_DATA_SIZE = 32 # Default NameNode RPC port. 8020 for all versions except 3.0.0. See # https://issues.apache.org/jira/browse/HDFS-12990 _DEFAULT_HDFS_PORT = 8020 _DEFAULT_BYTES_PER_CHECKSUM = 512 def _get_special_chr(): """ This is used to check unicode support. On some systems, depending on locale settings, we won't be able to use non-ASCII characters when interacting with system calls. Since in such cases it doesn't really make sense to run these tests we set UNI_CHR to a regular ASCII character. """ # something outside the latin-1 range the_chr = u'\N{CYRILLIC CAPITAL LETTER O WITH DIAERESIS}' fd = None fname = None try: fd, fname = tempfile.mkstemp(suffix=the_chr) except UnicodeEncodeError: msg = ("local file system doesn't support unicode characters" "in filenames, falling back to ASCII-only") warnings.warn(msg, UnicodeWarning) the_chr = u's' finally: if fd: os.close(fd) os.remove(fname) return the_chr UNI_CHR = _get_special_chr() _FD_MAP = { "stdout": sys.stdout.fileno(), "stderr": sys.stderr.fileno(), } class FSTree(object): """ >>> t = FSTree('root') >>> d1 = t.add('d1') >>> f1 = t.add('f1', 0) >>> d2 = d1.add('d2') >>> f2 = d2.add('f2', 0) >>> for x in t.walk(): print x.name, x.kind ... root 1 d1 1 d2 1 f2 0 f1 0 """ def __init__(self, name, kind=1): assert kind in (0, 1) # (file, dir) self.name = name self.kind = kind if self.kind: self.children = [] def add(self, name, kind=1): t = FSTree(name, kind) self.children.append(t) return t def walk(self): yield self if self.kind: for c in self.children: for t in c.walk(): yield t def make_wd(fs, prefix="pydoop_test_"): if fs.host: wd = "%s%s" % (prefix, uuid.uuid4().hex) fs.create_directory(wd) return fs.get_path_info(wd)['name'] else: return tempfile.mkdtemp(prefix=prefix) def make_random_data(size=_RANDOM_DATA_SIZE, printable=True): randint = random.randint start, stop = (32, 126) if printable else (0, 255) return bytes(bytearray([randint(start, stop) for _ in range(size)])) def get_bytes_per_checksum(): params = pydoop.hadoop_params() return int(params.get('dfs.bytes-per-checksum', params.get('io.bytes.per.checksum', _DEFAULT_BYTES_PER_CHECKSUM))) def silent_call(func, *args, **kwargs): with open(os.devnull, "w") as dev_null: cache = {} for s in "stdout", "stderr": cache[s] = os.dup(_FD_MAP[s]) os.dup2(dev_null.fileno(), _FD_MAP[s]) try: ret = func(*args, **kwargs) finally: for s in "stdout", "stderr": os.dup2(cache[s], _FD_MAP[s]) return ret def get_module(name, path=None): fp, pathname, description = imp.find_module(name, path) try: module = imp.load_module(name, fp, pathname, description) return module finally: fp.close() def compile_java(java_file, classpath, opts=None): if opts is None: opts = [] java_class_file = os.path.splitext( os.path.realpath(java_file) )[0] + '.class' if (not os.path.exists(java_class_file) or os.path.getmtime(java_file) > os.path.getmtime(java_class_file)): cmd = [JAVAC] + opts if not {"-cp", "-classpath"}.intersection(opts): cmd.extend(["-cp", classpath]) cmd.append(java_file) try: subprocess.check_call(cmd, cwd=os.path.dirname(java_file)) except subprocess.CalledProcessError as e: raise RuntimeError("Error compiling Java file %s\n%s" % ( 
java_file, e)) def run_java(jclass, classpath, args, wd): try: subprocess.check_call([JAVA, '-cp', classpath, jclass] + args, cwd=wd) except subprocess.CalledProcessError as e: raise RuntimeError("Error running Java class %s\n%s" % ( jclass, e)) def get_java_output_stream(jclass, classpath, args, wd): output = subprocess.check_output( [JAVA, '-cp', classpath, jclass] + args, cwd=wd, stderr=open('/dev/null', 'w')) return StringIO(output) class WDTestCase(unittest.TestCase): def setUp(self): self.wd = tempfile.mkdtemp(prefix='pydoop_test_') def tearDown(self): shutil.rmtree(self.wd) def _mkfn(self, basename): return os.path.join(self.wd, basename) def _mkf(self, basename, mode='w'): return open(self._mkfn(basename), mode) ================================================ FILE: pydoop/utils/__init__.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT """\ General purpose utilities. """ __all__ = [ 'NullHandler', 'NullLogger', 'make_random_str', ] from .misc import NullHandler, NullLogger, make_random_str ================================================ FILE: pydoop/utils/conversion_tables.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. 
# # END_COPYRIGHT # up-to-date as of Hadoop 2.7.1: http://hadoop.apache.org/docs/r2.7.1/ # hadoop-project-dist/hadoop-common/DeprecatedProperties.html mrv1_to_mrv2 = { 'create.empty.dir.if.nonexist': 'mapreduce.jobcontrol.createdir.ifnotexist', 'dfs.access.time.precision': 'dfs.namenode.accesstime.precision', 'dfs.backup.address': 'dfs.namenode.backup.address', 'dfs.backup.http.address': 'dfs.namenode.backup.http-address', 'dfs.balance.bandwidthPerSec': 'dfs.datanode.balance.bandwidthPerSec', 'dfs.block.size': 'dfs.blocksize', 'dfs.data.dir': 'dfs.datanode.data.dir', 'dfs.datanode.max.xcievers': 'dfs.datanode.max.transfer.threads', 'dfs.df.interval': 'fs.df.interval', 'dfs.federation.nameservice.id': 'dfs.nameservice.id', 'dfs.federation.nameservices': 'dfs.nameservices', 'dfs.http.address': 'dfs.namenode.http-address', 'dfs.https.address': 'dfs.namenode.https-address', 'dfs.https.client.keystore.resource': 'dfs.client.https.keystore.resource', 'dfs.https.need.client.auth': 'dfs.client.https.need-auth', 'dfs.max.objects': 'dfs.namenode.max.objects', 'dfs.max-repl-streams': 'dfs.namenode.replication.max-streams', 'dfs.name.dir': 'dfs.namenode.name.dir', 'dfs.name.dir.restore': 'dfs.namenode.name.dir.restore', 'dfs.name.edits.dir': 'dfs.namenode.edits.dir', 'dfs.permissions': 'dfs.permissions.enabled', 'dfs.permissions.supergroup': 'dfs.permissions.superusergroup', 'dfs.read.prefetch.size': 'dfs.client.read.prefetch.size', 'dfs.replication.considerLoad': 'dfs.namenode.replication.considerLoad', 'dfs.replication.interval': 'dfs.namenode.replication.interval', 'dfs.replication.min': 'dfs.namenode.replication.min', 'dfs.replication.pending.timeout.sec': 'dfs.namenode.replication.pending.timeout-sec', 'dfs.safemode.extension': 'dfs.namenode.safemode.extension', 'dfs.safemode.threshold.pct': 'dfs.namenode.safemode.threshold-pct', 'dfs.secondary.http.address': 'dfs.namenode.secondary.http-address', 'dfs.socket.timeout': 'dfs.client.socket-timeout', 'dfs.umaskmode': 'fs.permissions.umask-mode', 'dfs.write.packet.size': 'dfs.client-write-packet-size', 'fs.checkpoint.dir': 'dfs.namenode.checkpoint.dir', 'fs.checkpoint.edits.dir': 'dfs.namenode.checkpoint.edits.dir', 'fs.checkpoint.period': 'dfs.namenode.checkpoint.period', 'fs.default.name': 'fs.defaultFS', 'hadoop.configured.node.mapping': 'net.topology.configured.node.mapping', 'hadoop.job.history.location': 'mapreduce.jobtracker.jobhistory.location', 'hadoop.native.lib': 'io.native.lib.available', 'hadoop.net.static.resolutions': 'mapreduce.tasktracker.net.static.resolutions', 'hadoop.pipes.command-file.keep': 'mapreduce.pipes.commandfile.preserve', 'hadoop.pipes.executable.interpretor': 'mapreduce.pipes.executable.interpretor', 'hadoop.pipes.executable': 'mapreduce.pipes.executable', 'hadoop.pipes.java.mapper': 'mapreduce.pipes.isjavamapper', 'hadoop.pipes.java.recordreader': 'mapreduce.pipes.isjavarecordreader', 'hadoop.pipes.java.recordwriter': 'mapreduce.pipes.isjavarecordwriter', 'hadoop.pipes.java.reducer': 'mapreduce.pipes.isjavareducer', 'hadoop.pipes.partitioner': 'mapreduce.pipes.partitioner', 'heartbeat.recheck.interval': 'dfs.namenode.heartbeat.recheck-interval', 'io.bytes.per.checksum': 'dfs.bytes-per-checksum', 'io.sort.factor': 'mapreduce.task.io.sort.factor', 'io.sort.mb': 'mapreduce.task.io.sort.mb', 'io.sort.spill.percent': 'mapreduce.map.sort.spill.percent', 'jobclient.completion.poll.interval': 'mapreduce.client.completion.pollinterval', 'jobclient.output.filter': 'mapreduce.client.output.filter', 
'jobclient.progress.monitor.poll.interval': 'mapreduce.client.progressmonitor.pollinterval', 'job.end.notification.url': 'mapreduce.job.end-notification.url', 'job.end.retry.attempts': 'mapreduce.job.end-notification.retry.attempts', 'job.end.retry.interval': 'mapreduce.job.end-notification.retry.interval', 'job.local.dir': 'mapreduce.job.local.dir', 'keep.failed.task.files': 'mapreduce.task.files.preserve.failedtasks', 'keep.task.files.pattern': 'mapreduce.task.files.preserve.filepattern', 'key.value.separator.in.input.line': 'mapreduce.input.keyvaluelinerecordreader.key.value.separator', 'local.cache.size': 'mapreduce.tasktracker.cache.local.size', 'map.input.file': 'mapreduce.map.input.file', 'map.input.length': 'mapreduce.map.input.length', 'map.input.start': 'mapreduce.map.input.start', 'map.output.key.field.separator': 'mapreduce.map.output.key.field.separator', 'map.output.key.value.fields.spec': 'mapreduce.fieldsel.map.output.key.value.fields.spec', 'mapred.acls.enabled': 'mapreduce.cluster.acls.enabled', 'mapred.binary.partitioner.left.offset': 'mapreduce.partition.binarypartitioner.left.offset', 'mapred.binary.partitioner.right.offset': 'mapreduce.partition.binarypartitioner.right.offset', 'mapred.cache.archives': 'mapreduce.job.cache.archives', 'mapred.cache.archives.timestamps': 'mapreduce.job.cache.archives.timestamps', 'mapred.cache.files': 'mapreduce.job.cache.files', 'mapred.cache.files.timestamps': 'mapreduce.job.cache.files.timestamps', 'mapred.cache.localArchives': 'mapreduce.job.cache.local.archives', 'mapred.cache.localFiles': 'mapreduce.job.cache.local.files', 'mapred.child.tmp': 'mapreduce.task.tmp.dir', 'mapred.cluster.average.blacklist.threshold': 'mapreduce.jobtracker.blacklist.average.threshold', 'mapred.cluster.map.memory.mb': 'mapreduce.cluster.mapmemory.mb', 'mapred.cluster.max.map.memory.mb': 'mapreduce.jobtracker.maxmapmemory.mb', 'mapred.cluster.max.reduce.memory.mb': 'mapreduce.jobtracker.maxreducememory.mb', 'mapred.cluster.reduce.memory.mb': 'mapreduce.cluster.reducememory.mb', 'mapred.committer.job.setup.cleanup.needed': 'mapreduce.job.committer.setup.cleanup.needed', 'mapred.compress.map.output': 'mapreduce.map.output.compress', 'mapred.data.field.separator': 'mapreduce.fieldsel.data.field.separator', 'mapred.debug.out.lines': 'mapreduce.task.debugout.lines', 'mapred.healthChecker.interval': 'mapreduce.tasktracker.healthchecker.interval', 'mapred.healthChecker.script.args': 'mapreduce.tasktracker.healthchecker.script.args', 'mapred.healthChecker.script.path': 'mapreduce.tasktracker.healthchecker.script.path', 'mapred.healthChecker.script.timeout': 'mapreduce.tasktracker.healthchecker.script.timeout', 'mapred.heartbeats.in.second': 'mapreduce.jobtracker.heartbeats.in.second', 'mapred.hosts.exclude': 'mapreduce.jobtracker.hosts.exclude.filename', 'mapred.hosts': 'mapreduce.jobtracker.hosts.filename', 'mapred.inmem.merge.threshold': 'mapreduce.reduce.merge.inmem.threshold', 'mapred.input.dir.formats': 'mapreduce.input.multipleinputs.dir.formats', 'mapred.input.dir.mappers': 'mapreduce.input.multipleinputs.dir.mappers', 'mapred.input.dir': 'mapreduce.input.fileinputformat.inputdir', 'mapred.input.pathFilter.class': 'mapreduce.input.pathFilter.class', 'mapred.jar': 'mapreduce.job.jar', 'mapred.job.classpath.archives': 'mapreduce.job.classpath.archives', 'mapred.job.classpath.files': 'mapreduce.job.classpath.files', 'mapred.job.id': 'mapreduce.job.id', 'mapred.jobinit.threads': 'mapreduce.jobtracker.jobinit.threads', 'mapred.job.map.memory.mb': 
'mapreduce.map.memory.mb', 'mapred.job.name': 'mapreduce.job.name', 'mapred.job.priority': 'mapreduce.job.priority', 'mapred.job.queue.name': 'mapreduce.job.queuename', 'mapred.job.reduce.input.buffer.percent': 'mapreduce.reduce.input.buffer.percent', 'mapred.job.reduce.markreset.buffer.percent': 'mapreduce.reduce.markreset.buffer.percent', 'mapred.job.reduce.memory.mb': 'mapreduce.reduce.memory.mb', 'mapred.job.reduce.total.mem.bytes': 'mapreduce.reduce.memory.totalbytes', 'mapred.job.reuse.jvm.num.tasks': 'mapreduce.job.jvm.numtasks', 'mapred.job.shuffle.input.buffer.percent': 'mapreduce.reduce.shuffle.input.buffer.percent', 'mapred.job.shuffle.merge.percent': 'mapreduce.reduce.shuffle.merge.percent', 'mapred.job.tracker.handler.count': 'mapreduce.jobtracker.handler.count', 'mapred.job.tracker.history.completed.location': 'mapreduce.jobtracker.jobhistory.completed.location', 'mapred.job.tracker.http.address': 'mapreduce.jobtracker.http.address', 'mapred.jobtracker.instrumentation': 'mapreduce.jobtracker.instrumentation', 'mapred.jobtracker.job.history.block.size': 'mapreduce.jobtracker.jobhistory.block.size', 'mapred.job.tracker.jobhistory.lru.cache.size': 'mapreduce.jobtracker.jobhistory.lru.cache.size', 'mapred.job.tracker': 'mapreduce.jobtracker.address', 'mapred.jobtracker.maxtasks.per.job': 'mapreduce.jobtracker.maxtasks.perjob', 'mapred.job.tracker.persist.jobstatus.active': 'mapreduce.jobtracker.persist.jobstatus.active', 'mapred.job.tracker.persist.jobstatus.dir': 'mapreduce.jobtracker.persist.jobstatus.dir', 'mapred.job.tracker.persist.jobstatus.hours': 'mapreduce.jobtracker.persist.jobstatus.hours', 'mapred.jobtracker.restart.recover': 'mapreduce.jobtracker.restart.recover', 'mapred.job.tracker.retiredjobs.cache.size': 'mapreduce.jobtracker.retiredjobs.cache.size', 'mapred.job.tracker.retire.jobs': 'mapreduce.jobtracker.retirejobs', 'mapred.jobtracker.taskalloc.capacitypad': 'mapreduce.jobtracker.taskscheduler.taskalloc.capacitypad', 'mapred.jobtracker.taskScheduler': 'mapreduce.jobtracker.taskscheduler', 'mapred.jobtracker.taskScheduler.maxRunningTasksPerJob': 'mapreduce.jobtracker.taskscheduler.maxrunningtasks.perjob', 'mapred.join.expr': 'mapreduce.join.expr', 'mapred.join.keycomparator': 'mapreduce.join.keycomparator', 'mapred.lazy.output.format': 'mapreduce.output.lazyoutputformat.outputformat', 'mapred.line.input.format.linespermap': 'mapreduce.input.lineinputformat.linespermap', 'mapred.linerecordreader.maxlength': 'mapreduce.input.linerecordreader.line.maxlength', 'mapred.local.dir': 'mapreduce.cluster.local.dir', 'mapred.local.dir.minspacekill': 'mapreduce.tasktracker.local.dir.minspacekill', 'mapred.local.dir.minspacestart': 'mapreduce.tasktracker.local.dir.minspacestart', 'mapred.map.child.env': 'mapreduce.map.env', 'mapred.map.child.java.opts': 'mapreduce.map.java.opts', 'mapred.map.child.log.level': 'mapreduce.map.log.level', 'mapred.map.max.attempts': 'mapreduce.map.maxattempts', 'mapred.map.output.compression.codec': 'mapreduce.map.output.compress.codec', 'mapred.mapoutput.key.class': 'mapreduce.map.output.key.class', 'mapred.mapoutput.value.class': 'mapreduce.map.output.value.class', 'mapred.mapper.regex.group': 'mapreduce.mapper.regexmapper..group', 'mapred.mapper.regex': 'mapreduce.mapper.regex', 'mapred.map.task.debug.script': 'mapreduce.map.debug.script', 'mapred.map.tasks': 'mapreduce.job.maps', 'mapred.map.tasks.speculative.execution': 'mapreduce.map.speculative', 'mapred.max.map.failures.percent': 'mapreduce.map.failures.maxpercent', 
'mapred.max.reduce.failures.percent': 'mapreduce.reduce.failures.maxpercent', 'mapred.max.split.size': 'mapreduce.input.fileinputformat.split.maxsize', 'mapred.max.tracker.blacklists': 'mapreduce.jobtracker.tasktracker.maxblacklists', 'mapred.max.tracker.failures': 'mapreduce.job.maxtaskfailures.per.tracker', 'mapred.merge.recordsBeforeProgress': 'mapreduce.task.merge.progress.records', 'mapred.min.split.size': 'mapreduce.input.fileinputformat.split.minsize', 'mapred.min.split.size.per.node': 'mapreduce.input.fileinputformat.split.minsize.per.node', 'mapred.min.split.size.per.rack': 'mapreduce.input.fileinputformat.split.minsize.per.rack', 'mapred.output.compression.codec': 'mapreduce.output.fileoutputformat.compress.codec', 'mapred.output.compression.type': 'mapreduce.output.fileoutputformat.compress.type', 'mapred.output.compress': 'mapreduce.output.fileoutputformat.compress', 'mapred.output.dir': 'mapreduce.output.fileoutputformat.outputdir', 'mapred.output.key.class': 'mapreduce.job.output.key.class', 'mapred.output.key.comparator.class': 'mapreduce.job.output.key.comparator.class', 'mapred.output.value.class': 'mapreduce.job.output.value.class', 'mapred.output.value.groupfn.class': 'mapreduce.job.output.group.comparator.class', 'mapred.permissions.supergroup': 'mapreduce.cluster.permissions.supergroup', 'mapred.pipes.user.inputformat': 'mapreduce.pipes.inputformat', 'mapred.reduce.child.env': 'mapreduce.reduce.env', 'mapred.reduce.child.java.opts': 'mapreduce.reduce.java.opts', 'mapred.reduce.child.log.level': 'mapreduce.reduce.log.level', 'mapred.reduce.max.attempts': 'mapreduce.reduce.maxattempts', 'mapred.reduce.parallel.copies': 'mapreduce.reduce.shuffle.parallelcopies', 'mapred.reduce.slowstart.completed.maps': 'mapreduce.job.reduce.slowstart.completedmaps', 'mapred.reduce.task.debug.script': 'mapreduce.reduce.debug.script', 'mapred.reduce.tasks': 'mapreduce.job.reduces', 'mapred.reduce.tasks.speculative.execution': 'mapreduce.reduce.speculative', 'mapred.seqbinary.output.key.class': 'mapreduce.output.seqbinaryoutputformat.key.class', 'mapred.seqbinary.output.value.class': 'mapreduce.output.seqbinaryoutputformat.value.class', 'mapred.shuffle.connect.timeout': 'mapreduce.reduce.shuffle.connect.timeout', 'mapred.shuffle.read.timeout': 'mapreduce.reduce.shuffle.read.timeout', 'mapred.skip.attempts.to.start.skipping': 'mapreduce.task.skip.start.attempts', 'mapred.skip.map.auto.incr.proc.count': 'mapreduce.map.skip.proc-count.auto-incr', 'mapred.skip.map.max.skip.records': 'mapreduce.map.skip.maxrecords', 'mapred.skip.on': 'mapreduce.job.skiprecords', 'mapred.skip.out.dir': 'mapreduce.job.skip.outdir', 'mapred.skip.reduce.auto.incr.proc.count': 'mapreduce.reduce.skip.proc-count.auto-incr', 'mapred.skip.reduce.max.skip.groups': 'mapreduce.reduce.skip.maxgroups', 'mapred.speculative.execution.slowNodeThreshold': 'mapreduce.job.speculative.slownodethreshold', 'mapred.speculative.execution.slowTaskThreshold': 'mapreduce.job.speculative.slowtaskthreshold', 'mapred.speculative.execution.speculativeCap': 'mapreduce.job.speculative.speculativecap', 'mapred.submit.replication': 'mapreduce.client.submit.file.replication', 'mapred.system.dir': 'mapreduce.jobtracker.system.dir', 'mapred.task.cache.levels': 'mapreduce.jobtracker.taskcache.levels', 'mapred.task.id': 'mapreduce.task.attempt.id', 'mapred.task.is.map': 'mapreduce.task.ismap', 'mapred.task.partition': 'mapreduce.task.partition', 'mapred.task.profile': 'mapreduce.task.profile', 'mapred.task.profile.maps': 'mapreduce.task.profile.maps', 
'mapred.task.profile.params': 'mapreduce.task.profile.params', 'mapred.task.profile.reduces': 'mapreduce.task.profile.reduces', 'mapred.task.timeout': 'mapreduce.task.timeout', 'mapred.tasktracker.dns.interface': 'mapreduce.tasktracker.dns.interface', 'mapred.tasktracker.dns.nameserver': 'mapreduce.tasktracker.dns.nameserver', 'mapred.tasktracker.events.batchsize': 'mapreduce.tasktracker.events.batchsize', 'mapred.tasktracker.expiry.interval': 'mapreduce.jobtracker.expire.trackers.interval', 'mapred.task.tracker.http.address': 'mapreduce.tasktracker.http.address', 'mapred.tasktracker.indexcache.mb': 'mapreduce.tasktracker.indexcache.mb', 'mapred.tasktracker.instrumentation': 'mapreduce.tasktracker.instrumentation', 'mapred.tasktracker.map.tasks.maximum': 'mapreduce.tasktracker.map.tasks.maximum', 'mapred.tasktracker.memory_calculator_plugin': 'mapreduce.tasktracker.resourcecalculatorplugin', 'mapred.tasktracker.memorycalculatorplugin': 'mapreduce.tasktracker.resourcecalculatorplugin', 'mapred.tasktracker.reduce.tasks.maximum': 'mapreduce.tasktracker.reduce.tasks.maximum', 'mapred.task.tracker.report.address': 'mapreduce.tasktracker.report.address', 'mapred.task.tracker.task-controller': 'mapreduce.tasktracker.taskcontroller', 'mapred.tasktracker.taskmemorymanager.monitoring-interval': 'mapreduce.tasktracker.taskmemorymanager.monitoringinterval', 'mapred.tasktracker.tasks.sleeptime-before-sigkill': 'mapreduce.tasktracker.tasks.sleeptimebeforesigkill', 'mapred.temp.dir': 'mapreduce.cluster.temp.dir', 'mapred.text.key.comparator.options': 'mapreduce.partition.keycomparator.options', 'mapred.text.key.partitioner.options': 'mapreduce.partition.keypartitioner.options', 'mapred.textoutputformat.separator': 'mapreduce.output.textoutputformat.separator', 'mapred.tip.id': 'mapreduce.task.id', 'mapreduce.combine.class': 'mapreduce.job.combine.class', 'mapreduce.inputformat.class': 'mapreduce.job.inputformat.class', 'mapreduce.job.counters.limit': 'mapreduce.job.counters.max', 'mapreduce.jobtracker.permissions.supergroup': 'mapreduce.cluster.permissions.supergroup', 'mapreduce.map.class': 'mapreduce.job.map.class', 'mapreduce.outputformat.class': 'mapreduce.job.outputformat.class', 'mapreduce.partitioner.class': 'mapreduce.job.partitioner.class', 'mapreduce.reduce.class': 'mapreduce.job.reduce.class', 'mapred.used.genericoptionsparser': 'mapreduce.client.genericoptionsparser.used', 'mapred.userlog.limit.kb': 'mapreduce.task.userlog.limit.kb', 'mapred.userlog.retain.hours': 'mapreduce.job.userlog.retain.hours', 'mapred.working.dir': 'mapreduce.job.working.dir', 'mapred.work.output.dir': 'mapreduce.task.output.dir', 'min.num.spills.for.combine': 'mapreduce.map.combine.minspills', 'reduce.output.key.value.fields.spec': 'mapreduce.fieldsel.reduce.output.key.value.fields.spec', 'security.job.submission.protocol.acl': 'security.job.client.protocol.acl', 'security.task.umbilical.protocol.acl': 'security.job.task.protocol.acl', 'sequencefile.filter.class': 'mapreduce.input.sequencefileinputfilter.class', 'sequencefile.filter.frequency': 'mapreduce.input.sequencefileinputfilter.frequency', 'sequencefile.filter.regex': 'mapreduce.input.sequencefileinputfilter.regex', 'session.id': 'dfs.metrics.session-id', # duplicate key :-o # 'slave.host.name': # 'dfs.datanode.hostname', 'slave.host.name': 'mapreduce.tasktracker.host.name', 'tasktracker.contention.tracking': 'mapreduce.tasktracker.contention.tracking', 'tasktracker.http.threads': 'mapreduce.tasktracker.http.threads', 'topology.node.switch.mapping.impl': 
'net.topology.node.switch.mapping.impl', 'topology.script.file.name': 'net.topology.script.file.name', 'topology.script.number.args': 'net.topology.script.number.args', 'user.name': 'mapreduce.job.user.name', 'webinterface.private.actions': 'mapreduce.jobtracker.webinterface.trusted', ('yarn.app.mapreduce.yarn.app.mapreduce.' 'client-am.ipc.max-retries-on-timeouts'): 'yarn.app.mapreduce.client-am.ipc.max-retries-on-timeouts', } mrv2_to_mrv1 = dict((t[1], t[0]) for t in mrv1_to_mrv2.items()) ================================================ FILE: pydoop/utils/jvm.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT import os import shutil import string import subprocess import sys import tempfile import fnmatch JPROG = string.Template("""\ public class ${classname} { public static void main(String[] args) { System.out.println(System.getProperty("java.home")); } } """) def get_java_home(): """\ Try getting JAVA_HOME from system properties. We are interested in the JDK home, containing include/jni.h, while the java.home property points to the JRE home. If a JDK is installed, however, the two are (usually) related: the JDK home is either the same directory as the JRE home (recent java versions) or its parent (and java.home points to jdk_home/jre). 
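If JAVA_HOME is not set in the environment, we fall back to compiling and running a throwaway Java class (JPROG above) that prints the java.home property, then look for include/jni.h first in the reported directory and then in its parent.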
""" error = RuntimeError("java home not found, try setting JAVA_HOME") try: return os.environ["JAVA_HOME"] except KeyError: wd = tempfile.mkdtemp(prefix='pydoop_') jclass = "Temp" jsrc = os.path.join(wd, "%s.java" % jclass) with open(jsrc, "w") as f: f.write(JPROG.substitute(classname=jclass)) try: subprocess.check_call(["javac", jsrc]) path = subprocess.check_output( ["java", "-cp", wd, jclass], universal_newlines=True ) except (OSError, UnicodeDecodeError, subprocess.CalledProcessError): raise error finally: shutil.rmtree(wd) path = os.path.normpath(path.strip()) if os.path.exists(os.path.join(path, "include", "jni.h")): return path path = os.path.dirname(path) if os.path.exists(os.path.join(path, "include", "jni.h")): return path raise error def load_jvm_lib(java_home=None): if not java_home: java_home = get_java_home() jvm_path, jvm_lib = get_jvm_lib_path_and_name(java_home) if jvm_path and jvm_lib: from ctypes import CDLL CDLL(os.path.join(jvm_path, jvm_lib)) else: raise ImportError("Unable to load the JVM dynamic library") def get_include_dirs(): java_home = get_java_home() dirs = [os.path.join(java_home, 'include'), os.path.join('native', 'jni_include'), os.path.join(java_home, 'lib')] if sys.platform == 'win32': dirs += [os.path.join(java_home, 'include', 'win32')] elif sys.platform == 'darwin': dirs += [os.path.join(java_home, 'include', 'darwin')] elif sys.platform.startswith('freebsd'): dirs += [os.path.join(java_home, 'include', 'freebsd')] else: # linux dirs += [os.path.join(java_home, 'include', 'linux')] return dirs def get_libraries(): libraries = [] if sys.platform == 'win32': libraries += ['Advapi32'] elif sys.platform == 'darwin': libraries += ['dl', 'jvm'] elif sys.platform.startswith('freebsd'): libraries += ['jvm'] else: # linux etc. libraries += ['dl', "jvm"] return libraries def get_macros(): macros = [] if sys.platform == 'win32': macros += [('WIN32', 1)] elif sys.platform == 'darwin': macros += [('MACOSX', 1)] else: # linux etc. pass return macros def get_jvm_lib_path_and_name(java_home=None): if not java_home: java_home = get_java_home() jvm_lib_name = None if sys.platform == 'win32': jvm_lib_name = "jvm.dll" # FIXME: check the library name elif sys.platform == 'darwin': jvm_lib_name = "libjvm.dylib" else: # linux jvm_lib_name = "libjvm.so" jvm_path = find_file(java_home, jvm_lib_name) return os.path.dirname(jvm_path), jvm_lib_name if jvm_path else None def check_jni_header(include_dirs=None): for d in include_dirs: if os.path.exists(os.path.join(d, 'jni.h')): found_jni = True break if not found_jni: import warnings warnings.warn('Falling back to provided JNI headers: ' + 'unable to find jni.h in your JAVA_HOME') def find_file(path, to_find): result = None for element in os.listdir(path): if result: break if fnmatch.fnmatch(element, to_find): fullPath = os.path.join(path, element) result = fullPath if not result and os.path.isdir(os.path.join(path, element)): result = find_file(os.path.join(path, element), to_find) return result ================================================ FILE: pydoop/utils/misc.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. 
You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT """ Miscellaneous utilities. """ import logging import time import uuid DEFAULT_LOG_LEVEL = "WARNING" class NullHandler(logging.Handler): def emit(self, record): pass class NullLogger(logging.Logger): def __init__(self): logging.Logger.__init__(self, "null") self.propagate = 0 self.handlers = [NullHandler()] def make_random_str(prefix="pydoop_", postfix=''): return "%s%s%s" % (prefix, uuid.uuid4().hex, postfix) class Timer(object): def __init__(self, ctx, counter_group=None): self.ctx = ctx self._start_times = {} self._counters = {} self._counter_group = counter_group if counter_group else "Timer" def _gen_counter_name(self, event): return "TIME_" + event.upper() + " (ms)" def _get_time_counter(self, name): if name not in self._counters: counter_name = self._gen_counter_name(name) self._counters[name] = self.ctx.get_counter( self._counter_group, counter_name ) return self._counters[name] def start(self, s): self._start_times[s] = time.time() def stop(self, s): delta_ms = 1000 * (time.time() - self._start_times[s]) self.ctx.increment_counter(self._get_time_counter(s), int(delta_ms)) def time_block(self, event_name): return self.TimingBlock(self, event_name) class TimingBlock(object): def __init__(self, timer, event_name): self._timer = timer self._event_name = event_name def __enter__(self): self._timer.start(self._event_name) return self._timer def __exit__(self, exception_type, exception_val, exception_tb): self._timer.stop(self._event_name) return False ================================================ FILE: pydoop/utils/py3compat.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. 
# # END_COPYRIGHT import sys from abc import ABCMeta _is_py3 = sys.version_info[0] == 3 __all__ = [ "ABC", "basestring", "bintype", "cfilter", "clong", "cmap", "configparser", "czip", "iteritems", "parser_read", "pickle", "socketserver", "StringIO", "unicode", "xchr", ] class Py2ABC(object): __metaclass__ = ABCMeta def __identity(x): return x def __chr(x): return chr(x) def __iteritems_2(x): return x.iteritems() def __iteritems_3(x): return x.items() def __parser_read_2(parser, f): parser.readfp(f) def __parser_read_3(parser, f): parser.read_file(f) if _is_py3: from io import BytesIO as StringIO from abc import ABC import configparser import pickle import socketserver clong = int # something that should be interpreted as a string basestring = str unicode = str parser_read = __parser_read_3 xchr = __identity czip = zip cmap = map cfilter = filter iteritems = __iteritems_3 bintype = bytes else: from itertools import izip as czip from itertools import imap as cmap from itertools import ifilter as cfilter from cStringIO import StringIO import cPickle as pickle import ConfigParser as configparser import SocketServer as socketserver parser_read = __parser_read_2 # something that should be interpreted as a string basestring = unicode unicode = unicode clong = long # noqa: F821 xchr = __chr iteritems = __iteritems_2 bintype = str ABC = Py2ABC ================================================ FILE: pydoop.properties ================================================ AVRO_INPUT=pydoop.mapreduce.avro.input AVRO_OUTPUT=pydoop.mapreduce.avro.output AVRO_KEY_INPUT_SCHEMA=pydoop.mapreduce.avro.key.input.schema AVRO_KEY_OUTPUT_SCHEMA=pydoop.mapreduce.avro.key.output.schema AVRO_VALUE_INPUT_SCHEMA=pydoop.mapreduce.avro.value.input.schema AVRO_VALUE_OUTPUT_SCHEMA=pydoop.mapreduce.avro.value.output.schema PIPES_EXTERNALSPLITS_URI=pydoop.mapreduce.pipes.externalsplits.uri ================================================ FILE: requirements.txt ================================================ avro >=1.7.4; python_version < '3' avro-python3 >=1.7.4; python_version >= '3' setuptools # examples wheel ================================================ FILE: setup.cfg ================================================ [flake8] ignore = E402,W504 exclude = hadoop*,build ================================================ FILE: setup.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT """\ Pydoop is a Python MapReduce and HDFS API for Hadoop. Pydoop is built on top of two C/C++ extension modules: a libhdfs wrapper and a (de)serialization library for types used by the Hadoop Pipes protocol. Since libhdfs is, in turn, a JNI wrapper for the HDFS Java code, Pydoop needs a JDK (a JRE is not enough) to build. You can point Pydoop to the Java home directory by exporting the JAVA_HOME environment variable. Make sure JAVA_HOME points to the JDK home directory (e.g., ${JAVA_HOME}/include/jni.h should be a valid path). 
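For example, on a typical Linux box one might have export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 (the exact path depends on your distribution and JDK version).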
If JAVA_HOME is not defined, Pydoop will try to get the JDK home from Java system properties. To compile its Java components, Pydoop also needs to find the Hadoop libraries. In order to do so, it will try to call ``hadoop classpath``, so make sure that the ``hadoop`` executable is in the PATH. """ from __future__ import print_function import sys import time import os import glob import shutil import itertools import tempfile SETUPTOOLS_MIN_VER = '3.3' import setuptools from pkg_resources import parse_version # included in setuptools print('using setuptools version', setuptools.__version__) if parse_version(setuptools.__version__) < parse_version(SETUPTOOLS_MIN_VER): raise RuntimeError( 'setuptools minimum required version: %s' % SETUPTOOLS_MIN_VER ) # bug: http://bugs.python.org/issue1222585 # workaround: http://stackoverflow.com/questions/8106258 from distutils.sysconfig import get_config_var _UNWANTED_OPTS = frozenset(['-Wstrict-prototypes']) os.environ['OPT'] = ' '.join( _ for _ in get_config_var('OPT').strip().split() if _ not in _UNWANTED_OPTS ) from setuptools import setup, find_packages, Extension from setuptools.command.build_ext import build_ext from distutils.command.build import build from distutils.errors import DistutilsSetupError, CompileError from distutils import log import pydoop import pydoop.utils.jvm as jvm VERSION_FN = "VERSION" EXTRA_COMPILE_ARGS = ["-Wno-write-strings"] # http://bugs.python.org/issue6952 # properties file. Since the source is in the root dir, filename = basename PROP_FN = PROP_BN = pydoop.__propfile_basename__ CONSOLE_SCRIPTS = ['pydoop = pydoop.app.main:main'] if sys.version_info[0] == 3: CONSOLE_SCRIPTS.append('pydoop3 = pydoop.app.main:main') else: CONSOLE_SCRIPTS.append('pydoop2 = pydoop.app.main:main') # --------- # UTILITIES # --------- def rm_rf(path, dry_run=False): """ Remove a file or directory tree. Won't throw an exception, even if the removal fails. 
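Directories (excluding symlinks to directories) are removed recursively via shutil.rmtree; anything else goes through os.remove.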
""" log.info("removing %s" % path) if dry_run: return try: if os.path.isdir(path) and not os.path.islink(path): shutil.rmtree(path) else: os.remove(path) except OSError: pass def mtime(fn): return os.stat(fn).st_mtime def must_generate(target, prerequisites): try: return max(mtime(p) for p in prerequisites) > mtime(target) except OSError: return True def get_version_string(): try: with open(VERSION_FN) as f: return f.read().strip() except IOError: raise DistutilsSetupError("failed to read version info") def write_config(filename="pydoop/config.py"): prereq = PROP_FN if must_generate(filename, [prereq]): props = pydoop.read_properties(PROP_FN) with open(filename, "w") as fo: fo.write("# GENERATED BY setup.py\n") for k in sorted(props): fo.write("%s = %r\n" % (k, props[k])) def write_version(filename="pydoop/version.py"): if must_generate(filename, [VERSION_FN]): with open(filename, "w") as f: f.write("# GENERATED BY setup.py\n") f.write("version = %r\n" % (get_version_string(),)) EXTENSION_MODULES = [ Extension( 'pydoop.native_core_hdfs', include_dirs=[ 'src/libhdfs', 'src/libhdfs/include', 'src/libhdfs/os/posix', ], sources=list(itertools.chain( glob.iglob('src/libhdfs/*.c'), glob.iglob('src/libhdfs/common/*.c'), glob.iglob('src/libhdfs/os/posix/*.c'), glob.iglob('src/native_core_hdfs/*.cc') )), extra_compile_args=EXTRA_COMPILE_ARGS, # to be finalized at build time ), Extension( 'pydoop.sercore', sources=[ "src/sercore/hu_extras.cpp", "src/sercore/sercore.cpp", "src/sercore/streams.cpp", "src/sercore/HadoopUtils/SerialUtils.cc", ], include_dirs=["src/sercore/HadoopUtils"], extra_compile_args=EXTRA_COMPILE_ARGS + ["-std=c++11", "-O3"], ) ] # ------------ # BUILD ENGINE # ------------ class JavaLib(object): def __init__(self): self.jar_name = pydoop.jar_name() self.classpath = pydoop.hadoop_classpath() self.java_files = glob.glob( "src/it/crs4/pydoop/mapreduce/pipes/*.java" ) + ["src/it/crs4/pydoop/NoSeparatorTextOutputFormat.java"] self.dependencies = glob.glob('lib/*.jar') self.properties = [( os.path.join("it/crs4/pydoop/mapreduce/pipes", PROP_BN), PROP_FN )] class JavaBuilder(object): def __init__(self, build_temp, build_lib): self.build_temp = build_temp self.build_lib = build_lib self.java_libs = [JavaLib()] def run(self): for jlib in self.java_libs: self.__build_java_lib(jlib) def __build_java_lib(self, jlib): package_path = os.path.join(self.build_lib, "pydoop") compile_cmd = "javac" if jlib.classpath: classpath = [jlib.classpath] for src in jlib.dependencies: dest = os.path.join(package_path, os.path.basename(src)) shutil.copyfile(src, dest) classpath.append(dest) compile_cmd += " -classpath %s" % (':'.join(classpath)) else: log.warn( "WARNING: could not set classpath, java code may not compile" ) class_dir = os.path.join( self.build_temp, "pipes" ) jar_path = os.path.join(package_path, jlib.jar_name) if not os.path.exists(class_dir): os.mkdir(class_dir) compile_cmd += " -d '%s'" % class_dir log.info("Compiling Java classes") for f in jlib.java_files: compile_cmd += " %s" % f ret = os.system(compile_cmd) if ret: raise DistutilsSetupError( "Error compiling java component. 
Command: %s" % compile_cmd ) log.info("Copying properties file") for p in jlib.properties: prop_file_dest = os.path.join(class_dir, p[0]) shutil.copyfile(p[1], prop_file_dest) log.info("Making Jar: %s", jar_path) package_cmd = "jar -cf %(jar_path)s -C %(class_dir)s ./it" % { 'jar_path': jar_path, 'class_dir': class_dir } log.info("Packaging Java classes") log.info("Command: %s", package_cmd) ret = os.system(package_cmd) if ret: raise DistutilsSetupError( "Error packaging java component. Command: %s" % package_cmd ) class BuildPydoopExt(build_ext): def __have_better_tls(self): log.info("checking for TLS support") test_code = "int main(void) { static __thread int i = 0; return i; }" wd = tempfile.mkdtemp(prefix="pydoop_") test_src = os.path.join(wd, "temp.c") with open(test_src, "w") as f: f.write(test_code) try: self.compiler.compile([test_src], output_dir=wd) except CompileError: ret = False else: ret = True shutil.rmtree(wd) return ret def __finalize_hdfs(self, ext): """\ Adds a few bits that depend on the specific environment. Delaying this until the build_ext phase allows non-build commands (e.g., sdist) to be run without java. """ java_home = jvm.get_java_home() jvm_lib_path, _ = jvm.get_jvm_lib_path_and_name(java_home) ext.include_dirs = jvm.get_include_dirs() + ext.include_dirs ext.libraries = jvm.get_libraries() ext.library_dirs = [os.path.join(java_home, "Libraries"), jvm_lib_path] ext.define_macros = jvm.get_macros() ext.extra_link_args = ['-Wl,-rpath,%s' % jvm_lib_path] if self.__have_better_tls(): ext.define_macros.append(("HAVE_BETTER_TLS", None)) try: # too many warnings in libhdfs self.compiler.compiler_so.remove("-Wsign-compare") except (AttributeError, ValueError): pass # called for each extension, after compiler has been set up def build_extension(self, ext): if ext.name == "pydoop.native_core_hdfs": self.__finalize_hdfs(ext) build_ext.build_extension(self, ext) class BuildPydoop(build): def build_java(self): jb = JavaBuilder(self.build_temp, self.build_lib) jb.run() def create_tmp(self): if not os.path.exists(self.build_temp): os.mkdir(self.build_temp) if not os.path.exists(self.build_lib): os.mkdir(self.build_lib) def clean_up(self): shutil.rmtree(self.build_temp) def run(self): write_version() write_config() shutil.copyfile(PROP_FN, os.path.join("pydoop", PROP_BN)) build.run(self) try: self.create_tmp() self.build_java() finally: # On NFS, if we clean up right away we have issues with # NFS handles being still in the directory trees to be # deleted. 
So, we sleep a bit and then delete time.sleep(0.5) self.clean_up() log.info("Build finished") setup( name="pydoop", version=get_version_string(), description=pydoop.__doc__.strip().splitlines()[0], long_description=pydoop.__doc__.lstrip(), author=pydoop.__author__, author_email=pydoop.__author_email__, url=pydoop.__url__, download_url="https://pypi.python.org/pypi/pydoop", install_requires=['setuptools>=%s' % SETUPTOOLS_MIN_VER], extras_require={ 'avro': [ 'avro>=1.7.4;python_version<"3"', 'avro-python3>=1.7.4;python_version>="3"', ], }, packages=find_packages(exclude=['test', 'test.*']), package_data={"pydoop": [PROP_FN]}, cmdclass={ "build": BuildPydoop, "build_ext": BuildPydoopExt, }, entry_points={'console_scripts': CONSOLE_SCRIPTS}, platforms=["Linux"], ext_modules=EXTENSION_MODULES, license="Apache-2.0", keywords=["hadoop", "mapreduce"], classifiers=[ "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3.5", "License :: OSI Approved :: Apache Software License", "Operating System :: POSIX :: Linux", "Topic :: Software Development :: Libraries :: Application Frameworks", "Intended Audience :: Developers", ], data_files=[ ('config', ['README.md']), ], zip_safe=False, ) ================================================ FILE: src/Py_macros.h ================================================ /* BEGIN_COPYRIGHT * * Copyright 2009-2026 CRS4. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy * of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations * under the License. * * END_COPYRIGHT */ #ifndef PYDOOP_PY_MACROS #define PYDOOP_PY_MACROS 1 // FIXME: PyBytes should be ok in py2.7 too. #if IS_PY3K #define PyInt_Check PyLong_Check #define PyInt_AsLong PyLong_AsLong #define PyInt_AsSsize_t PyLong_AsSsize_t #define PyString_Check PyBytes_Check #define PyString_AsString PyBytes_AsString #else #endif #endif // PYDOOP_PY_MACROS ================================================ FILE: src/buf_macros.h ================================================ /* BEGIN_COPYRIGHT * * Copyright 2009-2026 CRS4. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy * of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations * under the License. 
* * END_COPYRIGHT */ #ifndef PYDOOP_BUF_MACROS #define PYDOOP_BUF_MACROS 1 #if IS_PY3K #define _PyBuf_FromStringAndSize(s,nbytes) PyBytes_FromStringAndSize(s, nbytes) #define _PyBuf_AS_STRING(b) PyBytes_AS_STRING(b) #define _PyBuf_Resize(b, n) _PyBytes_Resize(b, n) #define _PyBuf_FromString(x) PyBytes_FromString(x) #else #define _PyBuf_FromStringAndSize(s,nbytes) PyString_FromStringAndSize(s, nbytes) #define _PyBuf_AS_STRING(b) PyString_AS_STRING(b) #define _PyBuf_Resize(b, n) _PyString_Resize(b, n) #define _PyBuf_FromString(x) PyString_FromString(x) #endif #endif /* PYDOOP_BUF_MACROS */ ================================================ FILE: src/it/crs4/pydoop/NoSeparatorTextOutputFormat.java ================================================ // BEGIN_COPYRIGHT // // Copyright 2009-2026 CRS4. // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy // of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the // License for the specific language governing permissions and limitations // under the License. // // END_COPYRIGHT package it.crs4.pydoop; import java.io.DataOutputStream; import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.RecordWriter; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; import org.apache.hadoop.util.*; import org.apache.hadoop.util.Progressable; /** * A TextOutputFormat that doesn't insert a separator between key and value. */ public class NoSeparatorTextOutputFormat extends TextOutputFormat { public RecordWriter getRecordWriter(TaskAttemptContext job ) throws IOException, InterruptedException { final String keyValueSeparator = ""; Configuration conf = job.getConfiguration(); boolean isCompressed = getCompressOutput(job); CompressionCodec codec = null; String extension = ""; if (isCompressed) { Class codecClass = getOutputCompressorClass(job, GzipCodec.class); codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf); extension = codec.getDefaultExtension(); } Path file = getDefaultWorkFile(job, extension); FileSystem fs = file.getFileSystem(conf); if (!isCompressed) { FSDataOutputStream fileOut = fs.create(file, false); return new LineRecordWriter(fileOut, keyValueSeparator); } else { FSDataOutputStream fileOut = fs.create(file, false); return new LineRecordWriter( new DataOutputStream (codec.createOutputStream(fileOut)), keyValueSeparator); } } } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/Application.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. 
The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package it.crs4.pydoop.mapreduce.pipes; import java.io.File; import java.io.IOException; import java.net.ServerSocket; import java.net.Socket; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Random; import javax.crypto.SecretKey; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.io.FloatWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.conf.Configuration; /* FIXME org.apache.hadoop.mapred.TaskLog is clearly not what it is expected to be used with org.apache.hadoop.mapreduce.* For the time being, we use the following as a stand-in. it.crs4.pydoop.mapreduce.pipes.TaskLog; */ import org.apache.hadoop.mapreduce.TaskInputOutputContext; import org.apache.hadoop.mapreduce.TaskAttemptID; import org.apache.hadoop.mapreduce.TaskID; import org.apache.hadoop.mapreduce.MRJobConfig; import org.apache.hadoop.mapreduce.OutputCommitter; import org.apache.hadoop.mapreduce.filecache.DistributedCache; import org.apache.hadoop.mapreduce.security.SecureShuffleUtils; import org.apache.hadoop.mapreduce.security.TokenCache; import org.apache.hadoop.mapreduce.security.token.JobTokenIdentifier; import org.apache.hadoop.mapreduce.security.token.JobTokenSecretManager; import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter; import org.apache.hadoop.security.token.Token; import org.apache.hadoop.util.ReflectionUtils; import org.apache.hadoop.util.StringUtils; /** * This class is responsible for launching and communicating with the child * process. */ class Application { private static final Log LOG = LogFactory.getLog(Application.class.getName()); private ServerSocket serverSocket; private Process process; private Socket clientSocket; private OutputHandler handler; private DownwardProtocol downlink; static final boolean WINDOWS = System.getProperty("os.name").startsWith("Windows"); /** * Start the child process to handle the task for us. 
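* Concretely: open a server socket, export its port and the location of the job token secret to the child's environment, launch the child executable with stdout/stderr capture, then complete the digest-based authentication handshake before starting the downlink.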
* @throws IOException * @throws InterruptedException */ Application(TaskInputOutputContext context, DummyRecordReader input) throws IOException, InterruptedException { Configuration conf = context.getConfiguration(); OutputCommitter committer = context.getOutputCommitter(); if (committer instanceof FileOutputCommitter) { conf.set(MRJobConfig.TASK_OUTPUT_DIR, ((FileOutputCommitter)committer).getWorkPath().toString()); } serverSocket = new ServerSocket(0); Map env = new HashMap(); // add TMPDIR environment variable with the value of java.io.tmpdir env.put("TMPDIR", System.getProperty("java.io.tmpdir")); env.put(Submitter.PORT, Integer.toString(serverSocket.getLocalPort())); //Add token to the environment if security is enabled Token jobToken = TokenCache.getJobToken(context.getCredentials()); // This password is used as shared secret key between this application and // child pipes process byte[] password = jobToken.getPassword(); String localPasswordFile = new File( System.getProperty("user.dir"), "jobTokenPassword" ).getAbsolutePath(); writePasswordToLocalFile(localPasswordFile, password, conf); // FIXME why is this not Submitter.SECRET_LOCATION ? env.put("hadoop.pipes.shared.secret.location", localPasswordFile); List cmd = new ArrayList(); String interpretor = conf.get(Submitter.INTERPRETOR); if (interpretor != null) { cmd.add(interpretor); } String executable = context.getLocalCacheFiles()[0].toString(); if (!(new File(executable).canExecute())) { // LinuxTaskController sets +x permissions on all distcache files already. // In case of DefaultTaskController, set permissions here. FileUtil.chmod(executable, "u+x"); } cmd.add(executable); // wrap the command in a stdout/stderr capture // we are starting map/reduce task of the pipes job. this is not a cleanup // attempt. TaskAttemptID taskid = context.getTaskAttemptID(); File stdout = TaskLog.getTaskLogFile(taskid, false, TaskLog.LogName.STDOUT); File stderr = TaskLog.getTaskLogFile(taskid, false, TaskLog.LogName.STDERR); long logLength = TaskLog.getTaskLogLength(conf); cmd = TaskLog.captureOutAndError(null, cmd, stdout, stderr, logLength, false); process = runClient(cmd, env); clientSocket = serverSocket.accept(); String challenge = getSecurityChallenge(); String digestToSend = createDigest(password, challenge); String digestExpected = createDigest(password, digestToSend); handler = new OutputHandler(context, input, digestExpected); K2 outputKey = (K2) ReflectionUtils.newInstance(context.getOutputKeyClass(), conf); V2 outputValue = (V2) ReflectionUtils.newInstance(context.getOutputValueClass(), conf); downlink = new BinaryProtocol(clientSocket, handler, outputKey, outputValue, conf); downlink.authenticate(digestToSend, challenge); waitForAuthentication(); LOG.debug("Authentication succeeded"); downlink.start(); downlink.setJobConf(conf); } private String getSecurityChallenge() { Random rand = new Random(System.currentTimeMillis()); //Use 4 random integers so as to have 16 random bytes. 
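// The challenge string is only used as input to createDigest(password, challenge) below, a keyed hash computed (via SecureShuffleUtils) with the job token password, so each side can prove knowledge of the shared secret without ever sending it.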
StringBuilder strBuilder = new StringBuilder(); strBuilder.append(rand.nextInt(0x7fffffff)); strBuilder.append(rand.nextInt(0x7fffffff)); strBuilder.append(rand.nextInt(0x7fffffff)); strBuilder.append(rand.nextInt(0x7fffffff)); return strBuilder.toString(); } private void writePasswordToLocalFile(String localPasswordFile, byte[] password, Configuration conf) throws IOException { FileSystem localFs = FileSystem.getLocal(conf); Path localPath = new Path(localPasswordFile); FSDataOutputStream out = FileSystem.create(localFs, localPath, new FsPermission("400")); out.write(password); out.close(); } /** * Get the downward protocol object that can send commands down to the * application. * @return the downlink proxy */ DownwardProtocol getDownlink() { return downlink; } /** * Wait for authentication response. * @throws IOException * @throws InterruptedException */ void waitForAuthentication() throws IOException, InterruptedException { downlink.flush(); LOG.debug("Waiting for authentication response"); handler.waitForAuthentication(); } /** * Wait for the application to finish. * @return did the application finish correctly? * @throws Throwable */ boolean waitForFinish() throws Throwable { downlink.flush(); return handler.waitForFinish(); } /** * Abort the application and wait for it to finish. * @param t the exception that signalled the problem * @throws IOException A wrapper around the exception that was passed in */ void abort(Throwable t) throws IOException { LOG.info("Aborting because of " + StringUtils.stringifyException(t)); try { downlink.abort(); downlink.flush(); } catch (IOException e) { // IGNORE cleanup problems } try { handler.waitForFinish(); } catch (Throwable ignored) { process.destroy(); } IOException wrapper = new IOException("pipe child exception"); wrapper.initCause(t); throw wrapper; } /** * Clean up the child process and socket. * @throws IOException */ void cleanup() throws IOException { serverSocket.close(); try { downlink.close(); } catch (InterruptedException ie) { Thread.currentThread().interrupt(); } } /** * Run a given command in a subprocess, including threads to copy its stdout * and stderr to our stdout and stderr. * @param command the command and its arguments * @param env the environment to run the process in * @return a handle on the process * @throws IOException */ static Process runClient(List command, Map env) throws IOException { ProcessBuilder builder = new ProcessBuilder(command); if (env != null) { builder.environment().putAll(env); } Process result = builder.start(); return result; } public static String createDigest(byte[] password, String data) throws IOException { SecretKey key = JobTokenSecretManager.createSecretKey(password); return SecureShuffleUtils.hashFromString(data, key); } } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/BinaryProtocol.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License.
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package it.crs4.pydoop.mapreduce.pipes; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.FileOutputStream; import java.io.FilterOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.net.Socket; import java.util.ArrayList; import java.util.List; import java.util.Map; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.DataOutputBuffer; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.io.WritableUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.util.StringUtils; /** * This protocol is a binary implementation of the Pipes protocol. */ class BinaryProtocol implements DownwardProtocol { public static final int CURRENT_PROTOCOL_VERSION = 0; /** * The buffer size for the command socket */ private static final int BUFFER_SIZE = 128*1024; private DataOutputStream stream; private DataOutputBuffer buffer = new DataOutputBuffer(); private static final Log LOG = LogFactory.getLog(BinaryProtocol.class.getName()); private UplinkReaderThread uplink; /** * The integer codes to represent the different messages. These must match * the external program codes or massive confusion will result. */ private static enum MessageType { START(0), SET_JOB_CONF(1), SET_INPUT_TYPES(2), RUN_MAP(3), MAP_ITEM(4), RUN_REDUCE(5), REDUCE_KEY(6), REDUCE_VALUE(7), CLOSE(8), ABORT(9), AUTHENTICATION_REQ(10), OUTPUT(50), PARTITIONED_OUTPUT(51), STATUS(52), PROGRESS(53), DONE(54), REGISTER_COUNTER(55), INCREMENT_COUNTER(56), AUTHENTICATION_RESP(57); final int code; MessageType(int code) { this.code = code; } } private static class UplinkReaderThread extends Thread { private DataInputStream inStream; private UpwardProtocol handler; private K2 key; private V2 value; private boolean authPending = true; public UplinkReaderThread(InputStream stream, UpwardProtocol handler, K2 key, V2 value) throws IOException{ inStream = new DataInputStream(new BufferedInputStream(stream, BUFFER_SIZE)); this.handler = handler; this.key = key; this.value = value; } public void closeConnection() throws IOException { inStream.close(); } public void run() { while (true) { try { if (Thread.currentThread().isInterrupted()) { throw new InterruptedException(); } int cmd = WritableUtils.readVInt(inStream); LOG.debug("Handling uplink command " + cmd); if (cmd == MessageType.AUTHENTICATION_RESP.code) { String digest = Text.readString(inStream); authPending = !handler.authenticate(digest); } else if (authPending) { LOG.warn("Message " + cmd + " received before authentication is " + "complete. 
Ignoring"); continue; } else if (cmd == MessageType.OUTPUT.code) { readObject(key); readObject(value); handler.output(key, value); } else if (cmd == MessageType.PARTITIONED_OUTPUT.code) { int part = WritableUtils.readVInt(inStream); readObject(key); readObject(value); handler.partitionedOutput(part, key, value); } else if (cmd == MessageType.STATUS.code) { handler.status(Text.readString(inStream)); } else if (cmd == MessageType.PROGRESS.code) { handler.progress(inStream.readFloat()); } else if (cmd == MessageType.REGISTER_COUNTER.code) { int id = WritableUtils.readVInt(inStream); String group = Text.readString(inStream); String name = Text.readString(inStream); handler.registerCounter(id, group, name); } else if (cmd == MessageType.INCREMENT_COUNTER.code) { int id = WritableUtils.readVInt(inStream); long amount = WritableUtils.readVLong(inStream); handler.incrementCounter(id, amount); } else if (cmd == MessageType.DONE.code) { LOG.debug("Pipe child done"); handler.done(); return; } else { throw new IOException("Bad command code: " + cmd); } } catch (InterruptedException e) { return; } catch (Throwable e) { LOG.error(StringUtils.stringifyException(e)); handler.failed(e); return; } } } private void readObject(Writable obj) throws IOException { int numBytes = WritableUtils.readVInt(inStream); byte[] buffer; // For BytesWritable and Text, use the specified length to set the length // this causes the "obvious" translations to work. So that if you emit // a string "abc" from C++, it shows up as "abc". if (obj instanceof BytesWritable) { buffer = new byte[numBytes]; inStream.readFully(buffer); ((BytesWritable) obj).set(buffer, 0, numBytes); } else if (obj instanceof Text) { buffer = new byte[numBytes]; inStream.readFully(buffer); ((Text) obj).set(buffer); } else { obj.readFields(inStream); } } } /** * An output stream that will save a copy of the data into a file. */ private static class TeeOutputStream extends FilterOutputStream { private OutputStream file; TeeOutputStream(String filename, OutputStream base) throws IOException { super(base); file = new FileOutputStream(filename); } public void write(byte b[], int off, int len) throws IOException { file.write(b,off,len); out.write(b,off,len); } public void write(int b) throws IOException { file.write(b); out.write(b); } public void flush() throws IOException { file.flush(); out.flush(); } public void close() throws IOException { flush(); file.close(); out.close(); } } /** * Create a proxy object that will speak the binary protocol on a socket. * Upward messages are passed on the specified handler and downward * downward messages are public methods on this object. * @param sock The socket to communicate on. * @param handler The handler for the received messages. * @param key The object to read keys into. * @param value The object to read values into. * @param config The job's configuration * @throws IOException */ public BinaryProtocol(Socket sock, UpwardProtocol handler, K2 key, V2 value, Configuration config) throws IOException { OutputStream raw = sock.getOutputStream(); // If we are debugging, save a copy of the downlink commands to a file if (Submitter.getKeepCommandFile(config)) { raw = new TeeOutputStream("downlink.data", raw); } stream = new DataOutputStream(new BufferedOutputStream(raw, BUFFER_SIZE)) ; uplink = new UplinkReaderThread(sock.getInputStream(), handler, key, value); uplink.setName("pipe-uplink-handler"); uplink.start(); } /** * Close the connection and shutdown the handler thread. 
* @throws IOException * @throws InterruptedException */ public void close() throws IOException, InterruptedException { LOG.debug("closing connection"); stream.close(); uplink.closeConnection(); uplink.interrupt(); uplink.join(); } public void authenticate(String digest, String challenge) throws IOException { LOG.debug("Sending AUTHENTICATION_REQ, digest=" + digest + ", challenge=" + challenge); WritableUtils.writeVInt(stream, MessageType.AUTHENTICATION_REQ.code); Text.writeString(stream, digest); Text.writeString(stream, challenge); } public void start() throws IOException { LOG.debug("starting downlink"); WritableUtils.writeVInt(stream, MessageType.START.code); WritableUtils.writeVInt(stream, CURRENT_PROTOCOL_VERSION); } public void setJobConf(Configuration conf) throws IOException { WritableUtils.writeVInt(stream, MessageType.SET_JOB_CONF.code); List list = new ArrayList(); for(Map.Entry itm: conf) { list.add(itm.getKey()); list.add(itm.getValue()); } WritableUtils.writeVInt(stream, list.size()); for(String entry: list){ Text.writeString(stream, entry); } } public void setInputTypes(String keyType, String valueType) throws IOException { WritableUtils.writeVInt(stream, MessageType.SET_INPUT_TYPES.code); Text.writeString(stream, keyType); Text.writeString(stream, valueType); } public void runMap(InputSplit split, int numReduces, boolean pipedInput) throws IOException { if (!Writable.class.isInstance(split)) { throw new RuntimeException("split is not Writable"); } WritableUtils.writeVInt(stream, MessageType.RUN_MAP.code); writeObject((Writable)split); WritableUtils.writeVInt(stream, numReduces); WritableUtils.writeVInt(stream, pipedInput ? 1 : 0); } public void mapItem(Writable key, Writable value) throws IOException { WritableUtils.writeVInt(stream, MessageType.MAP_ITEM.code); writeObject(key); writeObject(value); } public void runReduce(int reduce, boolean pipedOutput) throws IOException { WritableUtils.writeVInt(stream, MessageType.RUN_REDUCE.code); WritableUtils.writeVInt(stream, reduce); WritableUtils.writeVInt(stream, pipedOutput ? 1 : 0); } public void reduceKey(Writable key) throws IOException { WritableUtils.writeVInt(stream, MessageType.REDUCE_KEY.code); writeObject(key); } public void reduceValue(Writable value) throws IOException { WritableUtils.writeVInt(stream, MessageType.REDUCE_VALUE.code); writeObject(value); } public void endOfInput() throws IOException { WritableUtils.writeVInt(stream, MessageType.CLOSE.code); LOG.debug("Sent close command"); } public void abort() throws IOException { WritableUtils.writeVInt(stream, MessageType.ABORT.code); LOG.debug("Sent abort command"); } public void flush() throws IOException { stream.flush(); } /** * Write the given object to the stream. If it is a Text or BytesWritable, * write it directly. Otherwise, write it to a buffer and then write the * length and data to the stream. * @param obj the object to write * @throws IOException */ private void writeObject(Writable obj) throws IOException { // For Text and BytesWritable, encode them directly, so that they end up // in C++ as the natural translations. 
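// In every branch below the wire format is the same: a VInt byte count followed by the raw bytes (zero bytes for null).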
if (obj instanceof Text) { Text t = (Text) obj; int len = t.getLength(); WritableUtils.writeVInt(stream, len); stream.write(t.getBytes(), 0, len); } else if (obj instanceof BytesWritable) { BytesWritable b = (BytesWritable) obj; int len = b.getLength(); WritableUtils.writeVInt(stream, len); stream.write(b.getBytes(), 0, len); } else if (obj == null) { // write a zero length string WritableUtils.writeVInt(stream, 0); } else { buffer.reset(); obj.write(buffer); int length = buffer.getLength(); WritableUtils.writeVInt(stream, length); stream.write(buffer.getData(), 0, length); } } } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/DownwardProtocol.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package it.crs4.pydoop.mapreduce.pipes; import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.mapreduce.InputSplit; /** * The abstract description of the downward (from Java to external program) * Pipes protocol. All of these calls are asynchronous and return before the * message has been processed. */ interface DownwardProtocol { /** * Request authentication. * @throws IOException */ void authenticate(String digest, String challenge) throws IOException; /** * Start communication. * @throws IOException */ void start() throws IOException; /** * Set the Configuration for the task. * @param conf * @throws IOException */ void setJobConf(Configuration conf) throws IOException; /** * Set the input types for Maps. * @param keyType the name of the key's type * @param valueType the name of the value's type * @throws IOException */ void setInputTypes(String keyType, String valueType) throws IOException; /** * Run a map task in the child. * @param split The input split for this map. * @param numReduces The number of reduces for this job. * @param pipedInput Is the input coming from Java? * @throws IOException */ void runMap(InputSplit split, int numReduces, boolean pipedInput) throws IOException; /** * For maps with pipedInput, the key/value pairs are sent via this message. * @param key The record's key * @param value The record's value * @throws IOException */ void mapItem(K key, V value) throws IOException; /** * Run a reduce task in the child. * @param reduce the index of the reduce (0 .. numReduces - 1) * @param pipedOutput is the output being sent to Java?
* @throws IOException */ void runReduce(int reduce, boolean pipedOutput) throws IOException; /** * The reduce should be given a new key. * @param key the new key * @throws IOException */ void reduceKey(K key) throws IOException; /** * The reduce should be given a new value. * @param value the new value * @throws IOException */ void reduceValue(V value) throws IOException; /** * The task has no more input coming, but it should finish processing its * input. * @throws IOException */ void endOfInput() throws IOException; /** * The task should stop as soon as possible, because something has gone wrong. * @throws IOException */ void abort() throws IOException; /** * Flush the data through any buffers. */ void flush() throws IOException; /** * Close the connection. */ void close() throws IOException, InterruptedException; } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/DummyRecordReader.java ================================================ // BEGIN_COPYRIGHT // // Copyright 2009-2026 CRS4. // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy // of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the // License for the specific language governing permissions and limitations // under the License. // // END_COPYRIGHT package it.crs4.pydoop.mapreduce.pipes; import java.io.IOException; import org.apache.hadoop.io.FloatWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapreduce.RecordReader; public abstract class DummyRecordReader extends RecordReader { public abstract boolean next(FloatWritable key, NullWritable value) throws IOException; } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/OpaqueSplit.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. */ package it.crs4.pydoop.mapreduce.pipes; import org.apache.hadoop.mapred.SplitLocationInfo; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Writable; import java.io.IOException; import java.io.DataInput; import java.io.DataOutput; /** * An opaque piece of information to be handled on the client side.
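* The payload is a BytesWritable that round-trips verbatim through the Writable write/readFields methods below; this class never interprets its contents.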
*/ class OpaqueSplit extends InputSplit implements Writable { private BytesWritable payload; public OpaqueSplit() { payload = new BytesWritable(); } public OpaqueSplit(byte[] payload) { this.payload = new BytesWritable(payload); } public BytesWritable getPayload() { return payload; } @Override public long getLength() { return payload.getLength(); } @Override public String toString() { return payload.toString(); } @Override public String[] getLocations() throws IOException { return new String[]{}; } @Override public SplitLocationInfo[] getLocationInfo() throws IOException { return new SplitLocationInfo[]{}; } // Writable methods @Override public void write(DataOutput out) throws IOException { payload.write(out); } @Override public void readFields(DataInput in) throws IOException { payload.readFields(in); } } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/OutputHandler.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package it.crs4.pydoop.mapreduce.pipes; import java.io.IOException; import java.util.HashMap; import java.util.Map; import org.apache.hadoop.io.FloatWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.mapreduce.Counters; import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.TaskInputOutputContext; /** * Handles the upward (C++ to Java) messages from the application. */ class OutputHandler implements UpwardProtocol { private TaskInputOutputContext context; private float progressValue = 0.0f; private boolean done = false; private Throwable exception = null; //RecordReader recordReader = null; DummyRecordReader recordReader = null; private Map registeredCounters = new HashMap(); private String expectedDigest = null; private boolean digestReceived = false; /** * Create a handler that will handle any records output from the application. * @param context the actual input and output interface to the Java hadoop system. * @param expectedDigest */ public OutputHandler(TaskInputOutputContext context, DummyRecordReader recordReader, String expectedDigest) { this.context = context; this.recordReader = recordReader; this.expectedDigest = expectedDigest; } /** * The task output a normal record. */ @Override public void output(K key, V value) throws IOException, InterruptedException { context.write(key, value); } /** * The task output a record with a partition number attached. */ @Override public void partitionedOutput(int reduce, K key, V value) throws IOException, InterruptedException { PipesPartitioner.setNextPartition(reduce); context.write(key, value); } /** * Update the status message for the task. 
*/ @Override public void status(String msg) { context.setStatus(msg); } private FloatWritable progressKey = new FloatWritable(0.0f); private NullWritable nullValue = NullWritable.get(); /** * Update the amount done and pass the progress upward. */ @Override public void progress(float progress) throws IOException { progressValue = progress; context.progress(); if (recordReader != null) { progressKey.set(progress); recordReader.next(progressKey, nullValue); } } /** * The task finished successfully. */ @Override public void done() throws IOException { synchronized (this) { done = true; notify(); } } /** * Get the current amount done. * @return a float between 0.0 and 1.0 */ public float getProgress() { return progressValue; } /** * The task failed with an exception. */ public void failed(Throwable e) { synchronized (this) { exception = e; notify(); } } /** * Wait for the task to finish or abort. * @return did the task finish correctly? * @throws Throwable */ public synchronized boolean waitForFinish() throws Throwable { while (!done && exception == null) { wait(); } if (exception != null) { throw exception; } return done; } @Override public void registerCounter(int id, String group, String name) throws IOException { Counter counter = context.getCounter(group, name); registeredCounters.put(id, counter); } @Override public void incrementCounter(int id, long amount) throws IOException { if (id < registeredCounters.size()) { Counter counter = registeredCounters.get(id); counter.increment(amount); } else { throw new IOException("Invalid counter with id: " + id); } } public synchronized boolean authenticate(String digest) throws IOException { boolean success = true; if (!expectedDigest.equals(digest)) { exception = new IOException("Authentication Failed: Expected digest=" + expectedDigest + ", received=" + digest); success = false; } digestReceived = true; notify(); return success; } /** * This is called by Application and blocks the thread until * the authentication response is received. * @throws IOException * @throws InterruptedException */ synchronized void waitForAuthentication() throws IOException, InterruptedException { while (!digestReceived && exception == null) { wait(); } if (exception != null) { throw new IOException(exception.getMessage()); } } } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/PipesMapper.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License.
*/ package it.crs4.pydoop.mapreduce.pipes; import java.io.IOException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.util.ReflectionUtils; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.InputFormat; /** * An adaptor to run an external mapper. */ class PipesMapper<K1, V1, K2, V2> extends Mapper<K1, V1, K2, V2> { protected static final Log LOG = LogFactory.getLog(PipesMapper.class); Context context; Application application = null; boolean skipping = false; @Override protected void setup(Context context) throws IOException, InterruptedException { this.context = context; // Disable the auto-increment of the counter: for pipes, the number of // processed records may differ from (be equal to or less than) the number // of input records. // FIXME: disable right now... // SkipBadRecords.setAutoIncrMapperProcCount(context, false); } @Override protected void cleanup(Context context) throws IOException, InterruptedException { if (application != null) { application.cleanup(); } } @Override public void run(Context context) throws IOException, InterruptedException { setup(context); Configuration conf = context.getConfiguration(); InputSplit split = context.getInputSplit(); // FIXME: do we really need to be so convoluted? InputFormat<K1, V1> inputFormat; try { inputFormat = (InputFormat<K1, V1>) ReflectionUtils.newInstance(context.getInputFormatClass(), conf); } catch (ClassNotFoundException ce) { throw new RuntimeException("class not found", ce); } RecordReader<K1, V1> input = inputFormat.createRecordReader(split, context); input.initialize(split, context); boolean isJavaInput = Submitter.getIsJavaRecordReader(conf); try { // FIXME: what happens for a Java mapper and no Java record reader? DummyRecordReader fakeInput = (!isJavaInput && !Submitter.getIsJavaMapper(conf)) ? (DummyRecordReader) input : null; application = new Application(context, fakeInput); } catch (InterruptedException ie) { throw new RuntimeException("interrupted", ie); } DownwardProtocol<K1, V1> downlink = application.getDownlink(); downlink.runMap(context.getInputSplit(), context.getNumReduceTasks(), isJavaInput); boolean skipping = conf.getBoolean(context.SKIP_RECORDS, false); boolean sent_input_types = false; try { if (isJavaInput) { // FIXME while (input.nextKeyValue()) { if (!sent_input_types) { sent_input_types = true; NullWritable n = NullWritable.get(); String kclass_name = n.getClass().getName(); String vclass_name = n.getClass().getName(); if (input.getCurrentKey() != null) { kclass_name = input.getCurrentKey().getClass().getName(); } if (input.getCurrentValue() != null) { vclass_name = input.getCurrentValue().getClass().getName(); } downlink.setInputTypes(kclass_name, vclass_name); } downlink.mapItem(input.getCurrentKey(), input.getCurrentValue()); if (skipping) { // Flush the streams on every record input if running in skip mode, // so that we don't buffer other records surrounding a bad record.
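// (Each map item is then pushed through the downlink immediately, at some
// throughput cost, so a downstream failure can be attributed to a single
// input record rather than to an arbitrary point in a buffered stream.)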
downlink.flush(); } } downlink.endOfInput(); } application.waitForFinish(); } catch (Throwable t) { application.abort(t); } finally { cleanup(context); } } } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/PipesNonJavaInputFormat.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package it.crs4.pydoop.mapreduce.pipes; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Properties; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.FloatWritable; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.mapreduce.InputFormat; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.JobContext; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.hadoop.util.ReflectionUtils; /** * Dummy input format used when a non-Java {@link RecordReader} is used by * the Pipes application. * * Sets up the Map-Reduce job to get the {@link PipesDummyRecordReader} and * the input splits. If pydoop.mapreduce.pipes.externalsplits.uri is * defined, input splits are read from the specified HDFS URI as a binary * sequence in the following format: a count N (an IntWritable) followed by * N opaque split records. If it's not defined, input splits are retrieved * by invoking the getSplits method of the 'actual' InputFormat specified * by the user in mapreduce.pipes.inputformat.
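 * <p> * A minimal sketch of a client producing such a splits file (editorial illustration; {@code fs}, {@code uri} and {@code splits} are assumed to be in scope): * <pre>{@code
 * FSDataOutputStream out = fs.create(new Path(uri));
 * new IntWritable(splits.size()).write(out);  // the count N
 * for (OpaqueSplit s : splits) {
 *   s.write(out);                             // N opaque split records
 * }
 * out.close();
 * }</pre> This mirrors, write-side, exactly what getOpaqueSplits below reads back.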
*/ class PipesNonJavaInputFormat extends InputFormat<FloatWritable, NullWritable> { public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException { Properties props = Submitter.getPydoopProperties(); Configuration conf = context.getConfiguration(); String uri = conf.get(props.getProperty("PIPES_EXTERNALSPLITS_URI")); if (uri != null) { return getOpaqueSplits(conf, uri); } else { return ReflectionUtils.newInstance( conf.getClass(Submitter.INPUT_FORMAT, TextInputFormat.class, InputFormat.class), conf).getSplits(context); } } private List<InputSplit> getOpaqueSplits(Configuration conf, String uri) throws IOException, InterruptedException { FileSystem fs = FileSystem.get(conf); Path path = new Path(uri); if (!fs.exists(path)) { throw new IOException(uri + " does not exist"); } List<InputSplit> splits = new ArrayList<InputSplit>(); FSDataInputStream in = fs.open(path); try { IntWritable numRecords = new IntWritable(); numRecords.readFields(in); for (int i = 0; i < numRecords.get(); i++) { OpaqueSplit o = new OpaqueSplit(); o.readFields(in); splits.add(o); } } finally { in.close(); } return splits; } @Override public DummyRecordReader createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException { return new PipesDummyRecordReader(split, context); } /** * A dummy {@link org.apache.hadoop.mapreduce.RecordReader} to help track the * progress of Hadoop Pipes applications when they are using a non-Java * RecordReader. * * The PipesDummyRecordReader is informed of the 'progress' of * the task by {@link OutputHandler#progress(float)}, which calls * {@link #next(FloatWritable, NullWritable)} with the progress as the * key. */ static class PipesDummyRecordReader extends DummyRecordReader { float progress = 0.0f; public PipesDummyRecordReader() {} public PipesDummyRecordReader(InputSplit split, TaskAttemptContext context) throws IOException { initialize(split, context); } @Override public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {} public synchronized void close() throws IOException {} @Override public float getProgress() throws IOException, InterruptedException { return progress; } @Override public boolean nextKeyValue() throws IOException, InterruptedException { return true; } @Override public FloatWritable getCurrentKey() throws IOException, InterruptedException { return new FloatWritable(progress); } @Override public NullWritable getCurrentValue() throws IOException, InterruptedException { return null; } @Override public synchronized boolean next(FloatWritable key, NullWritable value) throws IOException { progress = key.get(); return true; } } } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/PipesNonJavaOutputFormat.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * limitations under the License. */ package it.crs4.pydoop.mapreduce.pipes; import org.apache.hadoop.mapreduce.RecordWriter; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; /** * Ignores all output, but otherwise behaves like FileOutputFormat * (e.g., temp dir management). */ public class PipesNonJavaOutputFormat<K, V> extends FileOutputFormat<K, V> { @Override public RecordWriter<K, V> getRecordWriter(TaskAttemptContext context) { return new RecordWriter<K, V>() { @Override public void write(K key, V value) { } @Override public void close(TaskAttemptContext context) { } }; } } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/PipesPartitioner.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package it.crs4.pydoop.mapreduce.pipes; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.mapreduce.Partitioner; import org.apache.hadoop.conf.Configurable; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.util.ReflectionUtils; /** * This partitioner is one that can either be set manually per record or it * can fall back onto a Java partitioner that was set by the user. */ class PipesPartitioner<K extends WritableComparable, V extends Writable> extends Partitioner<K, V> implements Configurable { private static ThreadLocal<Integer> cache = new ThreadLocal<Integer>(); private Partitioner<K, V> part = null; private Configuration conf; public void setConf(Configuration conf) { this.conf = conf; part = ReflectionUtils.newInstance( Submitter.getJavaPartitioner(conf), conf); } public Configuration getConf() { return conf; } /** * Set the next key to have the given partition. * @param newValue the next partition value */ static void setNextPartition(int newValue) { cache.set(newValue); } /** * If a partition result was set manually, return it. Otherwise, we call * the Java partitioner. * @param key the key to partition * @param value the value to partition * @param numPartitions the number of reduces */ @Override public int getPartition(K key, V value, int numPartitions) { Integer result = cache.get(); if (result == null) { return part.getPartition(key, value, numPartitions); } else { return result; } } } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/PipesReducer.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership.
The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package it.crs4.pydoop.mapreduce.pipes; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.mapreduce.TaskInputOutputContext; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.ReduceContext; import org.apache.hadoop.mapreduce.MRJobConfig; import org.apache.hadoop.mapred.SkipBadRecords; import java.io.IOException; import java.util.Iterator; /** * This class is used to talk to a C++ reduce task. */ class PipesReducer<K2 extends WritableComparable, V2 extends Writable, K3 extends WritableComparable, V3 extends Writable> extends Reducer<K2, V2, K3, V3> { private static final Log LOG = LogFactory.getLog(PipesReducer.class.getName()); private Context context; private Configuration configuration; private Application application = null; private DownwardProtocol<K2, V2> downlink = null; private boolean isOk = true; @Override public void setup(Context context) { this.context = context; this.configuration = this.context.getConfiguration(); } /** * Process all of the keys and values. Start up the application if we haven't * started it yet. */ @Override public void reduce(K2 key, Iterable<V2> values, Context context) throws IOException, InterruptedException { isOk = false; startApplication(); downlink.reduceKey(key); for (V2 value : values) { downlink.reduceValue(value); } isOk = true; } @SuppressWarnings("unchecked") private void startApplication() throws IOException { if (application == null) { try { LOG.info("starting application"); application = new Application(context, null); downlink = application.getDownlink(); } catch (InterruptedException ie) { throw new RuntimeException("interrupted", ie); } int reduce = 0; downlink.runReduce(reduce, Submitter.getIsJavaRecordWriter(configuration)); } } /** * Handle the end of the input by closing down the application. */ @Override public void cleanup(Context context) throws IOException, InterruptedException { // if we haven't started the application, start it now so it can shut down cleanly if (isOk) { startApplication(); } try { if (isOk) { application.getDownlink().endOfInput(); } else { // send the abort to the application and let it clean up application.getDownlink().abort(); } LOG.info("waiting for finish"); application.waitForFinish(); LOG.info("got done"); } catch (Throwable t) { application.abort(t); } finally { application.cleanup(); } } } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/PydoopAvroBridgeKeyReader.java ================================================ // BEGIN_COPYRIGHT // // Copyright 2009-2026 CRS4. // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License.
You may obtain a copy // of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the // License for the specific language governing permissions and limitations // under the License. // // END_COPYRIGHT package it.crs4.pydoop.mapreduce.pipes; import java.util.Properties; import java.util.List; import java.util.Arrays; import java.io.IOException; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.conf.Configuration; import org.apache.avro.generic.IndexedRecord; public class PydoopAvroBridgeKeyReader extends PydoopAvroBridgeReaderBase { private Properties props; public PydoopAvroBridgeKeyReader( RecordReader actualReader) { this.actualReader = actualReader; props = Submitter.getPydoopProperties(); } protected List getInRecords() throws IOException, InterruptedException { IndexedRecord key = (IndexedRecord) actualReader.getCurrentKey(); return Arrays.asList(key); } public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { super.initialize(split, context); assert schemas.size() == 1; Configuration conf = context.getConfiguration(); conf.set(props.getProperty("AVRO_INPUT"), Submitter.AvroIO.K.name()); conf.set(props.getProperty("AVRO_KEY_INPUT_SCHEMA"), schemas.get(0).toString()); } @Override public Text getCurrentKey() throws IOException, InterruptedException { assert outRecords.size() == 1; return outRecords.get(0); } @Override public NullWritable getCurrentValue() throws IOException, InterruptedException { return NullWritable.get(); } } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/PydoopAvroBridgeKeyValueReader.java ================================================ // BEGIN_COPYRIGHT // // Copyright 2009-2026 CRS4. // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy // of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the // License for the specific language governing permissions and limitations // under the License. 
// // END_COPYRIGHT package it.crs4.pydoop.mapreduce.pipes; import java.util.Properties; import java.util.List; import java.util.Arrays; import java.io.IOException; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.io.Text; import org.apache.hadoop.conf.Configuration; import org.apache.avro.generic.IndexedRecord; public class PydoopAvroBridgeKeyValueReader extends PydoopAvroBridgeReaderBase { private Properties props; public PydoopAvroBridgeKeyValueReader( RecordReader actualReader) { this.actualReader = actualReader; props = Submitter.getPydoopProperties(); } protected List getInRecords() throws IOException, InterruptedException { IndexedRecord key = (IndexedRecord) actualReader.getCurrentKey(); IndexedRecord value = (IndexedRecord) actualReader.getCurrentValue(); return Arrays.asList(key, value); } public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { super.initialize(split, context); assert schemas.size() == 2; Configuration conf = context.getConfiguration(); conf.set(props.getProperty("AVRO_INPUT"), Submitter.AvroIO.KV.name()); conf.set(props.getProperty("AVRO_KEY_INPUT_SCHEMA"), schemas.get(0).toString()); conf.set(props.getProperty("AVRO_VALUE_INPUT_SCHEMA"), schemas.get(1).toString()); } @Override public Text getCurrentKey() throws IOException, InterruptedException { assert outRecords.size() == 2; return outRecords.get(0); } @Override public Text getCurrentValue() throws IOException, InterruptedException { assert outRecords.size() == 2; return outRecords.get(1); } } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/PydoopAvroBridgeKeyValueWriter.java ================================================ // BEGIN_COPYRIGHT // // Copyright 2009-2026 CRS4. // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy // of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the // License for the specific language governing permissions and limitations // under the License. // // END_COPYRIGHT package it.crs4.pydoop.mapreduce.pipes; import java.util.List; import java.util.Arrays; import java.io.IOException; import org.apache.hadoop.mapreduce.RecordWriter; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.io.Text; import org.apache.avro.generic.GenericRecord; public class PydoopAvroBridgeKeyValueWriter extends PydoopAvroBridgeWriterBase { public PydoopAvroBridgeKeyValueWriter( RecordWriter actualWriter, TaskAttemptContext context) { super(context, Submitter.AvroIO.KV); this.actualWriter = actualWriter; } public void write(Text key, Text value) throws IOException, InterruptedException { List outRecords = super.getOutRecords( Arrays.asList(key, value)); super.write(outRecords); } } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/PydoopAvroBridgeKeyWriter.java ================================================ // BEGIN_COPYRIGHT // // Copyright 2009-2026 CRS4. 
// // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy // of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the // License for the specific language governing permissions and limitations // under the License. // // END_COPYRIGHT package it.crs4.pydoop.mapreduce.pipes; import java.util.List; import java.util.Arrays; import java.io.IOException; import org.apache.hadoop.mapreduce.RecordWriter; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.avro.generic.GenericRecord; public class PydoopAvroBridgeKeyWriter extends PydoopAvroBridgeWriterBase { public PydoopAvroBridgeKeyWriter( RecordWriter actualWriter, TaskAttemptContext context) { super(context, Submitter.AvroIO.K); this.actualWriter = actualWriter; } public void write(Text key, Text ignore) throws IOException, InterruptedException { List<GenericRecord> outRecords = super.getOutRecords(Arrays.asList(key)); super.write(outRecords); } } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/PydoopAvroBridgeReaderBase.java ================================================ // BEGIN_COPYRIGHT // // Copyright 2009-2026 CRS4. // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy // of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the // License for the specific language governing permissions and limitations // under the License. // // END_COPYRIGHT package it.crs4.pydoop.mapreduce.pipes; import java.util.List; import java.util.ArrayList; import java.util.Iterator; import java.io.IOException; import java.io.ByteArrayOutputStream; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.io.Text; import org.apache.avro.Schema; import org.apache.avro.generic.IndexedRecord; import org.apache.avro.generic.GenericDatumWriter; import org.apache.avro.io.DatumWriter; import org.apache.avro.io.EncoderFactory; import org.apache.avro.io.BinaryEncoder; public abstract class PydoopAvroBridgeReaderBase<K, V> extends RecordReader<K, V> { private static final String COUNTERS_GROUP = PydoopAvroBridgeReaderBase.class.getName(); protected RecordReader<?, ?> actualReader; protected List<Schema> schemas; protected List<Text> outRecords; protected List<DatumWriter<IndexedRecord>> datumWriters; protected List<BinaryEncoder> encoders; protected List<ByteArrayOutputStream> outStreams; protected Counter nRecords; protected Counter readTimeCounter; protected Counter serTimeCounter; private List<IndexedRecord> bufferedInRecords; private long start; private boolean hasRecord; /** * Get current record(s) from the actual (input) RecordReader. * The returned list should contain one element for key-only or * value-only readers, two for key/value readers (this is not * enforced here, however).
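 * For instance, a value-only subclass could simply return (editorial sketch, matching the pattern used by the concrete readers in this package): * <pre>{@code
 * return Arrays.asList((IndexedRecord) actualReader.getCurrentValue());
 * }</pre>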
This method must NOT advance the actual * reader (it's the equivalent of getCurrent{Key,Value}, not of * nextKeyValue). */ protected abstract List<IndexedRecord> getInRecords() throws IOException, InterruptedException; public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { actualReader.initialize(split, context); nRecords = context.getCounter(COUNTERS_GROUP, "Number of records"); readTimeCounter = context.getCounter(COUNTERS_GROUP, "Read time (ms)"); serTimeCounter = context.getCounter( COUNTERS_GROUP, "Serialization time (ms)"); // peek at the record stream and save the schema(s) so that the concrete // subclass can set the schema property during initialization start = System.nanoTime(); hasRecord = actualReader.nextKeyValue(); if (hasRecord) { readTimeCounter.increment((System.nanoTime() - start) / 1000000); bufferedInRecords = getInRecords(); schemas = new ArrayList<Schema>(); datumWriters = new ArrayList<DatumWriter<IndexedRecord>>(); outStreams = new ArrayList<ByteArrayOutputStream>(); encoders = new ArrayList<BinaryEncoder>(); outRecords = new ArrayList<Text>(); for (IndexedRecord r : bufferedInRecords) { Schema s = r.getSchema(); schemas.add(s); datumWriters.add(new GenericDatumWriter<IndexedRecord>(s)); ByteArrayOutputStream stream = new ByteArrayOutputStream(); outStreams.add(stream); encoders.add(EncoderFactory.get().binaryEncoder(stream, null)); outRecords.add(new Text()); } } } public synchronized boolean nextKeyValue() throws IOException, InterruptedException { List<IndexedRecord> records = null; if (bufferedInRecords == null) { start = System.nanoTime(); hasRecord = actualReader.nextKeyValue(); if (!hasRecord) { return false; } else { readTimeCounter.increment((System.nanoTime() - start) / 1000000); records = getInRecords(); } } else { records = bufferedInRecords; bufferedInRecords = null; } //-- Iterator<IndexedRecord> iterRecords = records.iterator(); Iterator<DatumWriter<IndexedRecord>> iterWriters = datumWriters.iterator(); Iterator<BinaryEncoder> iterEncoders = encoders.iterator(); Iterator<ByteArrayOutputStream> iterStreams = outStreams.iterator(); Iterator<Text> iterOutRecords = outRecords.iterator(); start = System.nanoTime(); while (iterRecords.hasNext()) { ByteArrayOutputStream stream = iterStreams.next(); BinaryEncoder enc = iterEncoders.next(); try { iterWriters.next().write(iterRecords.next(), enc); enc.flush(); } catch (IOException e) { throw new RuntimeException(e); } iterOutRecords.next().set(new Text(stream.toByteArray())); stream.reset(); } serTimeCounter.increment((System.nanoTime() - start) / 1000000); nRecords.increment(1); return true; } public float getProgress() throws IOException, InterruptedException { return actualReader.getProgress(); } public synchronized void close() throws IOException { actualReader.close(); } } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/PydoopAvroBridgeValueReader.java ================================================ // BEGIN_COPYRIGHT // // Copyright 2009-2026 CRS4. // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy // of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the // License for the specific language governing permissions and limitations // under the License.
// // END_COPYRIGHT package it.crs4.pydoop.mapreduce.pipes; import java.util.Properties; import java.util.List; import java.util.Arrays; import java.io.IOException; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.conf.Configuration; import org.apache.avro.generic.IndexedRecord; public class PydoopAvroBridgeValueReader extends PydoopAvroBridgeReaderBase { private Properties props; public PydoopAvroBridgeValueReader( RecordReader actualReader) { this.actualReader = actualReader; props = Submitter.getPydoopProperties(); } protected List getInRecords() throws IOException, InterruptedException { IndexedRecord value = (IndexedRecord) actualReader.getCurrentValue(); return Arrays.asList(value); } public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { super.initialize(split, context); assert schemas.size() == 1; Configuration conf = context.getConfiguration(); conf.set(props.getProperty("AVRO_INPUT"), Submitter.AvroIO.V.name()); conf.set(props.getProperty("AVRO_VALUE_INPUT_SCHEMA"), schemas.get(0).toString()); } @Override public NullWritable getCurrentKey() throws IOException, InterruptedException { return NullWritable.get(); } @Override public Text getCurrentValue() throws IOException, InterruptedException { assert outRecords.size() == 1; return outRecords.get(0); } } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/PydoopAvroBridgeValueWriter.java ================================================ // BEGIN_COPYRIGHT // // Copyright 2009-2026 CRS4. // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy // of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the // License for the specific language governing permissions and limitations // under the License. // // END_COPYRIGHT package it.crs4.pydoop.mapreduce.pipes; import java.util.List; import java.util.Arrays; import java.io.IOException; import org.apache.hadoop.mapreduce.RecordWriter; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.avro.generic.GenericRecord; public class PydoopAvroBridgeValueWriter extends PydoopAvroBridgeWriterBase { public PydoopAvroBridgeValueWriter( RecordWriter actualWriter, TaskAttemptContext context) { super(context, Submitter.AvroIO.V); this.actualWriter = actualWriter; } public void write(Text ignore, Text value) throws IOException, InterruptedException { List outRecords = super.getOutRecords(Arrays.asList(value)); super.write(outRecords); } } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/PydoopAvroBridgeWriterBase.java ================================================ // BEGIN_COPYRIGHT // // Copyright 2009-2026 CRS4. // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. 
You may obtain a copy // of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the // License for the specific language governing permissions and limitations // under the License. // // END_COPYRIGHT package it.crs4.pydoop.mapreduce.pipes; import java.util.List; import java.util.ArrayList; import java.util.Iterator; import java.util.Properties; import java.io.IOException; import org.apache.hadoop.mapreduce.RecordWriter; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.conf.Configuration; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.GenericDatumReader; import org.apache.avro.io.DatumReader; import org.apache.avro.io.DecoderFactory; import org.apache.avro.io.Decoder; import org.apache.avro.io.BinaryDecoder; import static it.crs4.pydoop.mapreduce.pipes.Submitter.AvroIO; public abstract class PydoopAvroBridgeWriterBase extends RecordWriter<Text, Text> { private static final String COUNTERS_GROUP = PydoopAvroBridgeWriterBase.class.getName(); private long start; protected AvroIO mode; protected RecordWriter actualWriter; protected DecoderFactory decFactory; protected List<DatumReader<GenericRecord>> datumReaders; protected List<Decoder> decoders; protected List<GenericRecord> outRecords; protected Counter nRecords; protected Counter writeTimeCounter; protected Counter deserTimeCounter; public PydoopAvroBridgeWriterBase(TaskAttemptContext context, AvroIO mode) { Properties props = Submitter.getPydoopProperties(); Configuration conf = context.getConfiguration(); datumReaders = new ArrayList<DatumReader<GenericRecord>>(); decoders = new ArrayList<Decoder>(); outRecords = new ArrayList<GenericRecord>(); if (mode == AvroIO.K || mode == AvroIO.KV) { datumReaders.add(new GenericDatumReader<GenericRecord>(Schema.parse( conf.get(props.getProperty("AVRO_KEY_OUTPUT_SCHEMA"))))); decoders.add(null); outRecords.add(null); } if (mode == AvroIO.V || mode == AvroIO.KV) { datumReaders.add(new GenericDatumReader<GenericRecord>(Schema.parse( conf.get(props.getProperty("AVRO_VALUE_OUTPUT_SCHEMA"))))); decoders.add(null); outRecords.add(null); } decFactory = DecoderFactory.get(); this.mode = mode; //-- nRecords = context.getCounter(COUNTERS_GROUP, "Number of records"); writeTimeCounter = context.getCounter(COUNTERS_GROUP, "Write time (ms)"); deserTimeCounter = context.getCounter( COUNTERS_GROUP, "Deserialization time (ms)"); } protected List<GenericRecord> getOutRecords(List<Text> inRecords) throws IOException { start = System.nanoTime(); for (int i = 0; i < inRecords.size(); i++) { Decoder dec = decFactory.binaryDecoder( inRecords.get(i).getBytes(), (BinaryDecoder) decoders.get(i)); decoders.set(i, dec); outRecords.set(i, datumReaders.get(i).read(outRecords.get(i), dec)); } deserTimeCounter.increment((System.nanoTime() - start) / 1000000); return outRecords; } protected void write(List<GenericRecord> outRecords) throws IOException, InterruptedException { start = System.nanoTime(); switch (mode) { case K: actualWriter.write(outRecords.get(0), NullWritable.get()); break; case V: // Parquet writer does not accept a NullWritable key GenericRecord r = outRecords.get(0); actualWriter.write(null, r); break; case KV: actualWriter.write(outRecords.get(0), outRecords.get(1)); break; default: throw new RuntimeException("Invalid Avro I/O mode"); }
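// Update the per-record write metrics: the elapsed wall time (a nanoTime
// delta converted to ms) and the running record count.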
writeTimeCounter.increment((System.nanoTime() - start) / 1000000); nRecords.increment(1); } public void close(TaskAttemptContext context) throws IOException, InterruptedException { actualWriter.close(context); } } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/PydoopAvroInputBridgeBase.java ================================================ // BEGIN_COPYRIGHT // // Copyright 2009-2026 CRS4. // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy // of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the // License for the specific language governing permissions and limitations // under the License. // // END_COPYRIGHT package it.crs4.pydoop.mapreduce.pipes; import java.io.IOException; import java.util.List; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.JobContext; import org.apache.hadoop.mapreduce.InputFormat; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.hadoop.util.ReflectionUtils; public abstract class PydoopAvroInputBridgeBase extends InputFormat { protected InputFormat actualFormat; protected Class defaultActualFormat; protected InputFormat getActualFormat(Configuration conf) { if (actualFormat == null) { actualFormat = ReflectionUtils.newInstance( conf.getClass( Submitter.INPUT_FORMAT, defaultActualFormat, InputFormat.class), conf); } return actualFormat; } @Override public List getSplits(JobContext context) throws IOException, InterruptedException { Configuration conf = context.getConfiguration(); return getActualFormat(conf).getSplits(context); } } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/PydoopAvroInputKeyBridge.java ================================================ // BEGIN_COPYRIGHT // // Copyright 2009-2026 CRS4. // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy // of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the // License for the specific language governing permissions and limitations // under the License. 
// // END_COPYRIGHT package it.crs4.pydoop.mapreduce.pipes; import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; public class PydoopAvroInputKeyBridge extends PydoopAvroInputBridgeBase { public PydoopAvroInputKeyBridge() { defaultActualFormat = PydoopAvroKeyInputFormat.class; } @Override public RecordReader createRecordReader( InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { Configuration conf = context.getConfiguration(); return new PydoopAvroBridgeKeyReader( getActualFormat(conf).createRecordReader(split, context)); } } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/PydoopAvroInputKeyValueBridge.java ================================================ // BEGIN_COPYRIGHT // // Copyright 2009-2026 CRS4. // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy // of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the // License for the specific language governing permissions and limitations // under the License. // // END_COPYRIGHT package it.crs4.pydoop.mapreduce.pipes; import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; public class PydoopAvroInputKeyValueBridge extends PydoopAvroInputBridgeBase { public PydoopAvroInputKeyValueBridge() { defaultActualFormat = PydoopAvroKeyValueInputFormat.class; } @Override public RecordReader createRecordReader( InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { Configuration conf = context.getConfiguration(); return new PydoopAvroBridgeKeyValueReader( getActualFormat(conf).createRecordReader(split, context)); } } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/PydoopAvroInputValueBridge.java ================================================ // BEGIN_COPYRIGHT // // Copyright 2009-2026 CRS4. // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy // of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the // License for the specific language governing permissions and limitations // under the License. 
// // END_COPYRIGHT package it.crs4.pydoop.mapreduce.pipes; import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; public class PydoopAvroInputValueBridge extends PydoopAvroInputBridgeBase { public PydoopAvroInputValueBridge() { defaultActualFormat = PydoopAvroValueInputFormat.class; } @Override public RecordReader createRecordReader( InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { Configuration conf = context.getConfiguration(); return new PydoopAvroBridgeValueReader( getActualFormat(conf).createRecordReader(split, context)); } } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/PydoopAvroKeyInputFormat.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. */ package it.crs4.pydoop.mapreduce.pipes; import java.io.IOException; import org.apache.avro.generic.GenericRecord; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; public class PydoopAvroKeyInputFormat extends FileInputFormat { @Override public RecordReader createRecordReader( InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { // null readerSchema: the reader will fall back to the writer schema // FIXME: we could add our own property for setting the reader schema return new PydoopAvroKeyRecordReader(null); } } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/PydoopAvroKeyOutputFormat.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. 
*/ package it.crs4.pydoop.mapreduce.pipes; import java.io.IOException; import org.apache.avro.generic.GenericRecord; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapreduce.RecordWriter; import org.apache.hadoop.mapreduce.TaskAttemptContext; public class PydoopAvroKeyOutputFormat extends PydoopAvroOutputFormatBase { @Override @SuppressWarnings("unchecked") public RecordWriter getRecordWriter( TaskAttemptContext context) throws IOException { return new PydoopAvroKeyRecordWriter( getOutputSchema(context, "AVRO_KEY_OUTPUT_SCHEMA"), getCompressionCodec(context), getAvroFileOutputStream(context) ); } } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/PydoopAvroKeyRecordReader.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. */ package it.crs4.pydoop.mapreduce.pipes; import java.io.IOException; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.hadoop.io.NullWritable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class PydoopAvroKeyRecordReader extends PydoopAvroRecordReaderBase { private static final Logger LOG = LoggerFactory.getLogger( PydoopAvroKeyRecordReader.class); public PydoopAvroKeyRecordReader(Schema readerSchema) { super(readerSchema); } @Override public GenericRecord getCurrentKey() throws IOException, InterruptedException { return getCurrentRecord(); } @Override public NullWritable getCurrentValue() throws IOException, InterruptedException { return NullWritable.get(); } } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/PydoopAvroKeyRecordWriter.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. 
*/ package it.crs4.pydoop.mapreduce.pipes; import java.io.IOException; import java.io.OutputStream; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.avro.file.CodecFactory; import org.apache.hadoop.io.NullWritable; public class PydoopAvroKeyRecordWriter extends PydoopAvroRecordWriterBase { public PydoopAvroKeyRecordWriter(Schema writerSchema, CodecFactory compressionCodec, OutputStream outputStream) throws IOException { super(writerSchema, compressionCodec, outputStream); } @Override public void write(GenericRecord record, NullWritable ignore) throws IOException { mAvroFileWriter.append(record); } } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/PydoopAvroKeyValueInputFormat.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. */ package it.crs4.pydoop.mapreduce.pipes; import java.io.IOException; import org.apache.avro.generic.GenericRecord; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; public class PydoopAvroKeyValueInputFormat extends FileInputFormat { @Override public RecordReader createRecordReader( InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { // null readerSchema: the reader will fall back to the writer schema // FIXME: we could add our own property for setting the reader schema // FIXME: no distinction between top-level, key and value schema return new PydoopAvroKeyValueRecordReader(null); } } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/PydoopAvroKeyValueOutputFormat.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. 
*/ package it.crs4.pydoop.mapreduce.pipes; import java.io.IOException; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.avro.hadoop.io.AvroKeyValue; import org.apache.hadoop.mapreduce.RecordWriter; import org.apache.hadoop.mapreduce.TaskAttemptContext; public class PydoopAvroKeyValueOutputFormat extends PydoopAvroOutputFormatBase { @Override @SuppressWarnings("unchecked") public RecordWriter getRecordWriter( TaskAttemptContext context) throws IOException { Schema keyValueSchema = AvroKeyValue.getSchema( getOutputSchema(context, "AVRO_KEY_OUTPUT_SCHEMA"), getOutputSchema(context, "AVRO_VALUE_OUTPUT_SCHEMA") ); return new PydoopAvroKeyValueRecordWriter( keyValueSchema, getCompressionCodec(context), getAvroFileOutputStream(context) ); } } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/PydoopAvroKeyValueRecordReader.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. */ package it.crs4.pydoop.mapreduce.pipes; import java.io.IOException; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; public class PydoopAvroKeyValueRecordReader extends PydoopAvroRecordReaderBase { public PydoopAvroKeyValueRecordReader(Schema readerSchema) { super(readerSchema); } @Override public GenericRecord getCurrentKey() throws IOException, InterruptedException { return (GenericRecord) getCurrentRecord().get("key"); } @Override public GenericRecord getCurrentValue() throws IOException, InterruptedException { return (GenericRecord) getCurrentRecord().get("value"); } } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/PydoopAvroKeyValueRecordWriter.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. 

================================================
FILE: src/it/crs4/pydoop/mapreduce/pipes/PydoopAvroKeyValueRecordWriter.java
================================================
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.  See the License for the specific language governing
 * permissions and limitations under the License.
 */
package it.crs4.pydoop.mapreduce.pipes;

import java.io.IOException;
import java.io.OutputStream;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericData;
import org.apache.avro.file.CodecFactory;
import org.apache.avro.hadoop.io.AvroKeyValue;


public class PydoopAvroKeyValueRecordWriter
    extends PydoopAvroRecordWriterBase<GenericRecord, GenericRecord> {

  private Schema keyValueSchema;

  public PydoopAvroKeyValueRecordWriter(Schema writerSchema,
      CodecFactory compressionCodec, OutputStream outputStream)
      throws IOException {
    super(writerSchema, compressionCodec, outputStream);
    keyValueSchema = writerSchema;
  }

  @Override
  public void write(GenericRecord key, GenericRecord value)
      throws IOException {
    AvroKeyValue<GenericRecord, GenericRecord> kv =
        new AvroKeyValue<GenericRecord, GenericRecord>(
            new GenericData.Record(keyValueSchema));
    kv.setKey(key);
    kv.setValue(value);
    mAvroFileWriter.append(kv.get());
  }
}

================================================
FILE: src/it/crs4/pydoop/mapreduce/pipes/PydoopAvroOutputBridgeBase.java
================================================
// BEGIN_COPYRIGHT
//
// Copyright 2009-2026 CRS4.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License.  You may obtain a copy
// of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
// License for the specific language governing permissions and limitations
// under the License.
//
// END_COPYRIGHT
package it.crs4.pydoop.mapreduce.pipes;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.io.Text;


public abstract class PydoopAvroOutputBridgeBase extends OutputFormat {

  protected OutputFormat actualFormat;
  protected Class defaultActualFormat;

  protected OutputFormat getActualFormat(Configuration conf) {
    if (actualFormat == null) {
      actualFormat = ReflectionUtils.newInstance(
          conf.getClass(
              Submitter.OUTPUT_FORMAT, defaultActualFormat, OutputFormat.class),
          conf);
    }
    return actualFormat;
  }

  @Override
  public void checkOutputSpecs(JobContext context)
      throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    getActualFormat(conf).checkOutputSpecs(context);
  }

  @Override
  public OutputCommitter getOutputCommitter(TaskAttemptContext context)
      throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    return getActualFormat(conf).getOutputCommitter(context);
  }
}
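PydoopAvroOutputBridgeBase delegates spec checking and committing to whatever OutputFormat is configured under Submitter.OUTPUT_FORMAT ("mapreduce.pipes.outputformat", see Submitter.java below), falling back to defaultActualFormat when the property is unset. A minimal sketch of that reflective lookup, illustrative only (class name and default are made up):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;

class BridgeDelegationDemo {
  public static void main(String[] args) {
    Configuration conf = new Configuration(false);
    // property unset, so the lookup falls back to the supplied default
    OutputFormat fmt = ReflectionUtils.newInstance(
        conf.getClass("mapreduce.pipes.outputformat",  // Submitter.OUTPUT_FORMAT
                      TextOutputFormat.class, OutputFormat.class),
        conf);
    System.out.println(fmt.getClass());  // the fallback, TextOutputFormat
  }
}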

================================================
FILE: src/it/crs4/pydoop/mapreduce/pipes/PydoopAvroOutputFormatBase.java
================================================
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.  See the License for the specific language governing
 * permissions and limitations under the License.
 */
package it.crs4.pydoop.mapreduce.pipes;

import java.util.Properties;
import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.mapreduce.AvroOutputFormatBase;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.TaskAttemptContext;


public abstract class PydoopAvroOutputFormatBase extends AvroOutputFormatBase {

  protected static Schema getOutputSchema(
      TaskAttemptContext context, String propName) throws IOException {
    Properties props = Submitter.getPydoopProperties();
    Configuration conf = context.getConfiguration();
    String schemaJSON = conf.get(props.getProperty(propName));
    if (null == schemaJSON) {
      throw new IOException("Avro output requires an output schema");
    }
    return Schema.parse(schemaJSON);
  }
}

================================================
FILE: src/it/crs4/pydoop/mapreduce/pipes/PydoopAvroOutputKeyBridge.java
================================================
// BEGIN_COPYRIGHT
//
// Copyright 2009-2026 CRS4.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License.  You may obtain a copy
// of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
// License for the specific language governing permissions and limitations
// under the License.
//
// END_COPYRIGHT
package it.crs4.pydoop.mapreduce.pipes;

import java.io.IOException;

import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.io.Text;

import org.apache.avro.Schema;


public class PydoopAvroOutputKeyBridge extends PydoopAvroOutputBridgeBase {

  public PydoopAvroOutputKeyBridge() {
    defaultActualFormat = PydoopAvroKeyOutputFormat.class;
  }

  @Override
  public RecordWriter getRecordWriter(TaskAttemptContext context)
      throws IOException, InterruptedException {
    return new PydoopAvroBridgeKeyWriter(
        getActualFormat(context.getConfiguration()).getRecordWriter(context),
        context
    );
  }
}

================================================
FILE: src/it/crs4/pydoop/mapreduce/pipes/PydoopAvroOutputKeyValueBridge.java
================================================
// BEGIN_COPYRIGHT
//
// Copyright 2009-2026 CRS4.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License.  You may obtain a copy
// of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
// License for the specific language governing permissions and limitations
// under the License.
// // END_COPYRIGHT package it.crs4.pydoop.mapreduce.pipes; import java.io.IOException; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.RecordWriter; import org.apache.hadoop.io.Text; import org.apache.avro.Schema; public class PydoopAvroOutputKeyValueBridge extends PydoopAvroOutputBridgeBase { public PydoopAvroOutputKeyValueBridge() { defaultActualFormat = PydoopAvroKeyValueOutputFormat.class; } @Override public RecordWriter getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException { return new PydoopAvroBridgeKeyValueWriter( getActualFormat(context.getConfiguration()).getRecordWriter(context), context ); } } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/PydoopAvroOutputValueBridge.java ================================================ // BEGIN_COPYRIGHT // // Copyright 2009-2026 CRS4. // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy // of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the // License for the specific language governing permissions and limitations // under the License. // // END_COPYRIGHT package it.crs4.pydoop.mapreduce.pipes; import java.io.IOException; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.RecordWriter; import org.apache.hadoop.io.Text; import org.apache.avro.Schema; public class PydoopAvroOutputValueBridge extends PydoopAvroOutputBridgeBase { public PydoopAvroOutputValueBridge() { defaultActualFormat = PydoopAvroValueOutputFormat.class; } @Override public RecordWriter getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException { return new PydoopAvroBridgeValueWriter( getActualFormat(context.getConfiguration()).getRecordWriter(context), context ); } } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/PydoopAvroRecordReaderBase.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. 
 */
package it.crs4.pydoop.mapreduce.pipes;

import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.SeekableInput;
import org.apache.avro.mapred.FsInput;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


public abstract class PydoopAvroRecordReaderBase<K, V>
    extends RecordReader<K, V> {

  private static final Logger LOG = LoggerFactory.getLogger(
      PydoopAvroRecordReaderBase.class);

  private final Schema mReaderSchema;
  private GenericRecord mCurrentRecord;
  private DataFileReader<GenericRecord> mAvroFileReader;
  private long mStartPosition;
  private long mEndPosition;

  protected PydoopAvroRecordReaderBase(Schema readerSchema) {
    mReaderSchema = readerSchema;
    mCurrentRecord = null;
  }

  @Override
  public void initialize(InputSplit inputSplit, TaskAttemptContext context)
      throws IOException, InterruptedException {
    if (!(inputSplit instanceof FileSplit)) {
      throw new IllegalArgumentException("Only compatible with FileSplits.");
    }
    FileSplit fileSplit = (FileSplit) inputSplit;
    SeekableInput seekableFileInput = createSeekableInput(
        context.getConfiguration(), fileSplit.getPath());
    mAvroFileReader = new DataFileReader<GenericRecord>(
        seekableFileInput, new GenericDatumReader<GenericRecord>(
            mReaderSchema));
    // We will read the first block that begins after the input split
    // start; we will read up to but not including the first block
    // that begins after the input split end.
    mAvroFileReader.sync(fileSplit.getStart());
    mStartPosition = mAvroFileReader.previousSync();
    mEndPosition = fileSplit.getStart() + fileSplit.getLength();
  }

  @Override
  public boolean nextKeyValue() throws IOException, InterruptedException {
    assert null != mAvroFileReader;
    if (mAvroFileReader.hasNext() && !mAvroFileReader.pastSync(mEndPosition)) {
      mCurrentRecord = mAvroFileReader.next(mCurrentRecord);
      return true;
    }
    return false;
  }

  @Override
  public float getProgress() throws IOException, InterruptedException {
    assert null != mAvroFileReader;
    if (mEndPosition == mStartPosition) {
      return 0.0f;
    }
    long bytesRead = mAvroFileReader.previousSync() - mStartPosition;
    long bytesTotal = mEndPosition - mStartPosition;
    LOG.debug(
        "Progress: bytesRead=" + bytesRead + ", bytesTotal=" + bytesTotal);
    return Math.min(1.0f, (float) bytesRead / (float) bytesTotal);
  }

  @Override
  public void close() throws IOException {
    if (null != mAvroFileReader) {
      try {
        mAvroFileReader.close();
      } finally {
        mAvroFileReader = null;
      }
    }
  }

  protected GenericRecord getCurrentRecord() {
    return mCurrentRecord;
  }

  protected SeekableInput createSeekableInput(Configuration conf, Path path)
      throws IOException {
    return new FsInput(path, conf);
  }
}
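initialize and nextKeyValue above implement the usual Avro split convention: a reader owns every block whose sync marker falls at or after its split start, and stops at the first block that begins past the split end, so adjacent splits never yield the same record twice. A standalone sketch of the same ownership rule against a local container file, illustrative only (file name and split bounds are made up):

import java.io.File;
import java.io.IOException;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

class AvroSplitScanDemo {
  public static void main(String[] args) throws IOException {
    long splitStart = 0L;            // hypothetical split bounds
    long splitEnd = 1L << 20;
    DataFileReader<GenericRecord> reader = new DataFileReader<GenericRecord>(
        new File("part-00000.avro"),  // hypothetical input file
        new GenericDatumReader<GenericRecord>());
    reader.sync(splitStart);          // first block at or after split start
    GenericRecord record = null;
    while (reader.hasNext() && !reader.pastSync(splitEnd)) {
      record = reader.next(record);   // reuse the record instance
      System.out.println(record);
    }
    reader.close();
  }
}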

================================================
FILE: src/it/crs4/pydoop/mapreduce/pipes/PydoopAvroRecordWriterBase.java
================================================
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.  See the License for the specific language governing
 * permissions and limitations under the License.
 */
package it.crs4.pydoop.mapreduce.pipes;

import java.io.IOException;
import java.io.OutputStream;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.file.CodecFactory;
import org.apache.avro.file.DataFileWriter;

import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;


public abstract class PydoopAvroRecordWriterBase<K, V>
    extends RecordWriter<K, V> {

  protected final DataFileWriter<GenericRecord> mAvroFileWriter;

  protected PydoopAvroRecordWriterBase(Schema writerSchema,
      CodecFactory compressionCodec, OutputStream outputStream)
      throws IOException {
    mAvroFileWriter = new DataFileWriter<GenericRecord>(
        new GenericDatumWriter<GenericRecord>(writerSchema));
    mAvroFileWriter.setCodec(compressionCodec);
    mAvroFileWriter.create(writerSchema, outputStream);
  }

  @Override
  public void close(TaskAttemptContext context) throws IOException {
    mAvroFileWriter.close();
  }
}

================================================
FILE: src/it/crs4/pydoop/mapreduce/pipes/PydoopAvroValueInputFormat.java
================================================
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.  See the License for the specific language governing
 * permissions and limitations under the License.
 */
package it.crs4.pydoop.mapreduce.pipes;

import java.io.IOException;

import org.apache.avro.generic.GenericRecord;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;


public class PydoopAvroValueInputFormat
    extends FileInputFormat<NullWritable, GenericRecord> {

  @Override
  public RecordReader<NullWritable, GenericRecord> createRecordReader(
      InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException {
    // null readerSchema: the reader will fall back to the writer schema
    // FIXME: we could add our own property for setting the reader schema
    return new PydoopAvroValueRecordReader(null);
  }
}

================================================
FILE: src/it/crs4/pydoop/mapreduce/pipes/PydoopAvroValueOutputFormat.java
================================================
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.
 * The ASF licenses this file to you under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with the
 * License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.  See the License for the specific language governing
 * permissions and limitations under the License.
 */
package it.crs4.pydoop.mapreduce.pipes;

import java.io.IOException;

import org.apache.avro.generic.GenericRecord;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;


public class PydoopAvroValueOutputFormat extends PydoopAvroOutputFormatBase {

  @Override
  @SuppressWarnings("unchecked")
  public RecordWriter<NullWritable, GenericRecord> getRecordWriter(
      TaskAttemptContext context) throws IOException {
    return new PydoopAvroValueRecordWriter(
        getOutputSchema(context, "AVRO_VALUE_OUTPUT_SCHEMA"),
        getCompressionCodec(context),
        getAvroFileOutputStream(context)
    );
  }
}

================================================
FILE: src/it/crs4/pydoop/mapreduce/pipes/PydoopAvroValueRecordReader.java
================================================
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.  See the License for the specific language governing
 * permissions and limitations under the License.
 */
package it.crs4.pydoop.mapreduce.pipes;

import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;

import org.apache.hadoop.io.NullWritable;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


public class PydoopAvroValueRecordReader
    extends PydoopAvroRecordReaderBase<NullWritable, GenericRecord> {

  private static final Logger LOG = LoggerFactory.getLogger(
      PydoopAvroValueRecordReader.class);

  public PydoopAvroValueRecordReader(Schema readerSchema) {
    super(readerSchema);
  }

  @Override
  public NullWritable getCurrentKey()
      throws IOException, InterruptedException {
    return NullWritable.get();
  }

  @Override
  public GenericRecord getCurrentValue()
      throws IOException, InterruptedException {
    return getCurrentRecord();
  }
}

================================================
FILE: src/it/crs4/pydoop/mapreduce/pipes/PydoopAvroValueRecordWriter.java
================================================
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.  See the License for the specific language governing
 * permissions and limitations under the License.
 */
package it.crs4.pydoop.mapreduce.pipes;

import java.io.IOException;
import java.io.OutputStream;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.file.CodecFactory;

import org.apache.hadoop.io.NullWritable;


public class PydoopAvroValueRecordWriter
    extends PydoopAvroRecordWriterBase<NullWritable, GenericRecord> {

  public PydoopAvroValueRecordWriter(Schema writerSchema,
      CodecFactory compressionCodec, OutputStream outputStream)
      throws IOException {
    super(writerSchema, compressionCodec, outputStream);
  }

  @Override
  public void write(NullWritable ignore, GenericRecord record)
      throws IOException {
    mAvroFileWriter.append(record);
  }
}

================================================
FILE: src/it/crs4/pydoop/mapreduce/pipes/Submitter.java
================================================
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ package it.crs4.pydoop.mapreduce.pipes; import java.util.Properties; import java.io.InputStream; import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; import java.net.URL; import java.net.URLClassLoader; import java.security.AccessController; import java.security.PrivilegedAction; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.commons.cli.BasicParser; import org.apache.commons.cli.Parser; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.util.ExitUtil; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.GenericOptionsParser; import org.apache.hadoop.io.Text; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat; import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner; import org.apache.hadoop.mapreduce.filecache.DistributedCache; import org.apache.hadoop.mapreduce.MRJobConfig; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.InputFormat; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.OutputFormat; import org.apache.hadoop.mapreduce.Partitioner; import org.apache.hadoop.mapreduce.Reducer; /** * A command line parser for the CLI-based Pipes job submitter. */ class CommandLineParser { private Options options = new Options(); CommandLineParser() { addOption("input", false, "input path to the maps", "path"); addOption("output", false, "output path from the reduces", "path"); addOption("jar", false, "job jar file", "path"); addOption("inputformat", false, "java classname of InputFormat", "class"); addOption("map", false, "java classname of Mapper", "class"); addOption("partitioner", false, "java classname of Partitioner", "class"); addOption("reduce", false, "java classname of Reducer", "class"); addOption("writer", false, "java classname of OutputFormat", "class"); addOption("program", false, "URI to application executable", "class"); addOption("reduces", false, "number of reduces", "num"); addOption("lazyOutput", false, "Optional. 
Create output lazily", "boolean"); addOption("avroInput", false, "avro input mode", "boolean"); addOption("avroOutput", false, "avro output mode", "boolean"); } void addOption(String longName, boolean required, String description, String paramName) { Option option = OptionBuilder.withArgName(paramName) .hasArgs(1).withDescription(description) .isRequired(required).create(longName); options.addOption(option); } void addArgument(String name, boolean required, String description) { Option option = OptionBuilder.withArgName(name) .hasArgs(1).withDescription(description) .isRequired(required).create(); options.addOption(option); } CommandLine parse(Configuration conf, String[] args) throws IOException, ParseException { Parser parser = new BasicParser(); conf.setBoolean("mapreduce.client.genericoptionsparser.used", true); GenericOptionsParser genericParser = new GenericOptionsParser(conf, args); return parser.parse(options, genericParser.getRemainingArgs()); } void printUsage() { // The CLI package should do this for us, but I can't figure out how // to make it print something reasonable. System.out.println("bin/hadoop pipes"); System.out.println(" [-input ] // Input directory"); System.out.println(" [-output ] // Output directory"); System.out.println(" [-jar // jar filename"); System.out.println(" [-inputformat ] // InputFormat class"); System.out.println(" [-map ] // Java Map class"); System.out.println(" [-partitioner ] // Java Partitioner"); System.out.println(" [-reduce ] // Java Reduce class"); System.out.println(" [-writer ] // Java RecordWriter"); System.out.println(" [-program ] // executable URI"); System.out.println(" [-reduces ] // number of reduces"); System.out.println(" [-lazyOutput ] // createOutputLazily"); System.out.println(" [-avroInput ] // avro input"); System.out.println(" [-avroOutput ] // avro output"); System.out.println(); GenericOptionsParser.printGenericCommandUsage(System.out); } } public class Submitter extends Configured implements Tool { public static enum AvroIO { K, // {Input,Output}Format key type is avro record V, // {Input,Output}Format value type is avro record KV, // {Input,Output}Format {key,value} type is avro record } protected static final Log LOG = LogFactory.getLog(Submitter.class); protected static final String PROP_FILE = "pydoop.properties"; protected static AvroIO avroInput; protected static AvroIO avroOutput; protected static boolean explicitInputFormat = false; protected static boolean explicitOutputFormat = false; // --- pydoop properties --- protected static Properties props; public static final String PRESERVE_COMMANDFILE = "mapreduce.pipes.commandfile.preserve"; public static final String EXECUTABLE = "mapreduce.pipes.executable"; public static final String INTERPRETOR = "mapreduce.pipes.executable.interpretor"; public static final String IS_JAVA_MAP = "mapreduce.pipes.isjavamapper"; public static final String IS_JAVA_RR = "mapreduce.pipes.isjavarecordreader"; public static final String IS_JAVA_RW = "mapreduce.pipes.isjavarecordwriter"; public static final String IS_JAVA_REDUCE = "mapreduce.pipes.isjavareducer"; public static final String PARTITIONER = "mapreduce.pipes.partitioner"; public static final String INPUT_FORMAT = "mapreduce.pipes.inputformat"; public static final String OUTPUT_FORMAT = "mapreduce.pipes.outputformat"; public static final String PORT = "mapreduce.pipes.command.port"; public static Properties getPydoopProperties() { Properties properties = new Properties(); InputStream stream = 
Submitter.class.getResourceAsStream(PROP_FILE); try { properties.load(stream); stream.close(); } catch (NullPointerException e) { throw new RuntimeException("Could not find " + PROP_FILE); } catch (IOException e) { throw new RuntimeException("Could not read " + PROP_FILE); } return properties; } public Submitter() { super(); props = getPydoopProperties(); } public static boolean isLocalFS(Configuration conf) throws IOException { return FileSystem.get(conf).equals(FileSystem.getLocal(conf)); } /** * Get the URI of the application's executable. * @param conf * @return the URI where the application's executable is located */ public static String getExecutable(Configuration conf) { return conf.get(Submitter.EXECUTABLE); } /** * Set the URI for the application's executable. Normally this is a hdfs: * location. * @param conf * @param executable The URI of the application's executable. */ public static void setExecutable(Configuration conf, String executable) { conf.set(Submitter.EXECUTABLE, executable); } /** * Set whether the job is using a Java RecordReader. * @param conf the configuration to modify * @param value the new value */ public static void setIsJavaRecordReader(Configuration conf, boolean value) { conf.setBoolean(Submitter.IS_JAVA_RR, value); } /** * Check whether the job is using a Java RecordReader * @param conf the configuration to check * @return is it a Java RecordReader? */ public static boolean getIsJavaRecordReader(Configuration conf) { return conf.getBoolean(Submitter.IS_JAVA_RR, false); } /** * Set whether the Mapper is written in Java. * @param conf the configuration to modify * @param value the new value */ public static void setIsJavaMapper(Configuration conf, boolean value) { conf.setBoolean(Submitter.IS_JAVA_MAP, value); } /** * Check whether the job is using a Java Mapper. * @param conf the configuration to check * @return is it a Java Mapper? */ public static boolean getIsJavaMapper(Configuration conf) { return conf.getBoolean(Submitter.IS_JAVA_MAP, false); } /** * Set whether the Reducer is written in Java. * @param conf the configuration to modify * @param value the new value */ public static void setIsJavaReducer(Configuration conf, boolean value) { conf.setBoolean(Submitter.IS_JAVA_REDUCE, value); } /** * Check whether the job is using a Java Reducer. * @param conf the configuration to check * @return is it a Java Reducer? */ public static boolean getIsJavaReducer(Configuration conf) { return conf.getBoolean(Submitter.IS_JAVA_REDUCE, false); } /** * Set whether the job will use a Java RecordWriter. * @param conf the configuration to modify * @param value the new value to set */ public static void setIsJavaRecordWriter(Configuration conf, boolean value) { conf.setBoolean(Submitter.IS_JAVA_RW, value); } /** * Will the reduce use a Java RecordWriter? * @param conf the configuration to check * @return true, if the output of the job will be written by Java */ public static boolean getIsJavaRecordWriter(Configuration conf) { return conf.getBoolean(Submitter.IS_JAVA_RW, false); } /** * Set the configuration, if it doesn't already have a value for the given * key. * @param conf the configuration to modify * @param key the key to set * @param value the new "default" value to set */ private static void setIfUnset(Configuration conf, String key, String value) { if (conf.get(key) == null) { conf.set(key, value); } } /** * Save away the user's original partitioner before we override it. 
* @param conf the configuration to modify * @param cls the user's partitioner class */ static void setJavaPartitioner(Configuration conf, Class cls) { conf.set(Submitter.PARTITIONER, cls.getName()); } /** * Get the user's original partitioner. * @param conf the configuration to look in * @return the class that the user submitted */ static Class getJavaPartitioner(Configuration conf) { return conf.getClass(Submitter.PARTITIONER, HashPartitioner.class, Partitioner.class); } private static Class getClass(CommandLine cl, String key, Configuration conf, Class cls) throws ClassNotFoundException { return conf.getClassByName(cl.getOptionValue(key)).asSubclass(cls); } /** * Does the user want to keep the command file for debugging? If * this is true, pipes will write a copy of the command data to a * file in the task directory named "downlink.data", which may be * used to run the C++ program under the debugger. You probably also * want to set Configuration.setKeepFailedTaskFiles(true) to keep * the entire directory from being deleted. To run using the data * file, set the environment variable "mapreduce.pipes.commandfile" * to point to the file. * @param conf the configuration to check * @return will the framework save the command file? */ public static boolean getKeepCommandFile(Configuration conf) { return conf.getBoolean(Submitter.PRESERVE_COMMANDFILE, false); } /** * Set whether to keep the command file for debugging * @param conf the configuration to modify * @param keep the new value */ public static void setKeepCommandFile(Configuration conf, boolean keep) { conf.setBoolean(Submitter.PRESERVE_COMMANDFILE, keep); } private static void setupPipesJob(Job job) throws IOException, ClassNotFoundException, URISyntaxException { Configuration conf = job.getConfiguration(); // -libjars does not work when running on the local FS if (isLocalFS(conf)) { URL[] libjars = GenericOptionsParser.getLibJars(conf); for (URL jarUrl: libjars) { job.addFileToClassPath(new Path(jarUrl.toURI())); } } // default map output types to Text if (!getIsJavaMapper(conf)) { job.setMapperClass(PipesMapper.class); // Save the user's partitioner and hook in our's. setJavaPartitioner(conf, job.getPartitionerClass()); job.setPartitionerClass(PipesPartitioner.class); } if (!getIsJavaReducer(conf)) { job.setReducerClass(PipesReducer.class); if (!getIsJavaRecordWriter(conf)) { job.setOutputFormatClass(PipesNonJavaOutputFormat.class); } } String textClassname = Text.class.getName(); setIfUnset(conf, MRJobConfig.MAP_OUTPUT_KEY_CLASS, textClassname); setIfUnset(conf, MRJobConfig.MAP_OUTPUT_VALUE_CLASS, textClassname); setIfUnset(conf, MRJobConfig.OUTPUT_KEY_CLASS, textClassname); setIfUnset(conf, MRJobConfig.OUTPUT_VALUE_CLASS, textClassname); // Use PipesNonJavaInputFormat if necessary to handle progress reporting // from C++ RecordReaders ... 
    if (!getIsJavaRecordReader(conf) && !getIsJavaMapper(conf)) {
      conf.setClass(Submitter.INPUT_FORMAT,
                    job.getInputFormatClass(), InputFormat.class);
      job.setInputFormatClass(PipesNonJavaInputFormat.class);
    }
    if (avroInput != null) {
      if (explicitInputFormat) {
        conf.setClass(Submitter.INPUT_FORMAT,
                      job.getInputFormatClass(), InputFormat.class);
      }  // else let the bridge fall back to the appropriate Avro IF
      switch (avroInput) {
      case K:
        job.setInputFormatClass(PydoopAvroInputKeyBridge.class);
        break;
      case V:
        job.setInputFormatClass(PydoopAvroInputValueBridge.class);
        break;
      case KV:
        job.setInputFormatClass(PydoopAvroInputKeyValueBridge.class);
        break;
      default:
        throw new IllegalArgumentException("Bad Avro input type");
      }
    }
    if (avroOutput != null) {
      if (explicitOutputFormat) {
        conf.setClass(Submitter.OUTPUT_FORMAT,
                      job.getOutputFormatClass(), OutputFormat.class);
      }  // else let the bridge fall back to the appropriate Avro OF
      conf.set(props.getProperty("AVRO_OUTPUT"), avroOutput.name());
      switch (avroOutput) {
      case K:
        job.setOutputFormatClass(PydoopAvroOutputKeyBridge.class);
        break;
      case V:
        job.setOutputFormatClass(PydoopAvroOutputValueBridge.class);
        break;
      case KV:
        job.setOutputFormatClass(PydoopAvroOutputKeyValueBridge.class);
        break;
      default:
        throw new IllegalArgumentException("Bad Avro output type");
      }
    }
    String exec = getExecutable(conf);
    if (exec == null) {
      String msg = "No application program defined.";
      throw new IllegalArgumentException(msg);
    }
    // add default debug script only when executable is expressed as
    // <path>#<executable>
    //FIXME: this is kind of useless if the pipes program is not in c++
    if (exec.contains("#")) {
      // set default gdb commands for map and reduce task
      String defScript =
          "$HADOOP_PREFIX/src/c++/pipes/debug/pipes-default-script";
      setIfUnset(conf, MRJobConfig.MAP_DEBUG_SCRIPT, defScript);
      setIfUnset(conf, MRJobConfig.REDUCE_DEBUG_SCRIPT, defScript);
    }
    URI[] fileCache = DistributedCache.getCacheFiles(conf);
    if (fileCache == null) {
      fileCache = new URI[1];
    } else {
      URI[] tmp = new URI[fileCache.length + 1];
      System.arraycopy(fileCache, 0, tmp, 1, fileCache.length);
      fileCache = tmp;
    }
    try {
      fileCache[0] = new URI(exec);
    } catch (URISyntaxException e) {
      String msg = "Problem parsing executable URI " + exec;
      IOException ie = new IOException(msg);
      ie.initCause(e);
      throw ie;
    }
    DistributedCache.setCacheFiles(fileCache, conf);
  }
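  // Usage sketch (hypothetical paths and options, not part of this class):
  // setupPipesJob is reached through run() below, which, since Submitter
  // implements Tool, can also be driven programmatically, e.g.:
  //
  //   int exit = ToolRunner.run(new Submitter(), new String[] {
  //       "-input", "in_dir", "-output", "out_dir",
  //       "-program", "hdfs:///apps/myapp",
  //       "-avroInput", "KV", "-avroOutput", "KV"});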
(results.hasOption("reduce")) { setIsJavaReducer(conf, true); job.setReducerClass(getClass(results, "reduce", conf, Reducer.class)); } if (results.hasOption("reduces")) { job.setNumReduceTasks(Integer.parseInt( results.getOptionValue("reduces"))); } if (results.hasOption("writer")) { explicitOutputFormat = true; setIsJavaRecordWriter(conf, true); job.setOutputFormatClass(getClass(results, "writer", conf, OutputFormat.class)); } if (results.hasOption("lazyOutput")) { if (Boolean.parseBoolean(results.getOptionValue("lazyOutput"))) { LazyOutputFormat.setOutputFormatClass( job, job.getOutputFormatClass()); } } if (results.hasOption("avroInput")) { avroInput = AvroIO.valueOf( results.getOptionValue("avroInput").toUpperCase()); } if (results.hasOption("avroOutput")) { avroOutput = AvroIO.valueOf( results.getOptionValue("avroOutput").toUpperCase()); } if (results.hasOption("program")) { setExecutable(conf, results.getOptionValue("program")); } // if they gave us a jar file, include it into the class path String jarFile = job.getJar(); if (jarFile != null) { final URL[] urls = new URL[] { FileSystem.getLocal(conf).pathToFile(new Path(jarFile)).toURL() }; // FindBugs complains that creating a URLClassLoader should be // in a doPrivileged() block. ClassLoader loader = AccessController.doPrivileged( new PrivilegedAction() { public ClassLoader run() {return new URLClassLoader(urls);} } ); conf.setClassLoader(loader); } setupPipesJob(job); return job.waitForCompletion(true) ? 0 : 1; } catch (ParseException pe) { LOG.info("Error : " + pe); cli.printUsage(); return 1; } } public static void main(String[] args) throws Exception { int exitCode = new Submitter().run(args); ExitUtil.terminate(exitCode); } } ================================================ FILE: src/it/crs4/pydoop/mapreduce/pipes/TaskLog.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package it.crs4.pydoop.mapreduce.pipes; import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.DataOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.Flushable; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Enumeration; import java.util.List; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ThreadFactory; import java.util.concurrent.TimeUnit; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.LocalFileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.SecureIOUtils; import org.apache.hadoop.mapreduce.JobID; import org.apache.hadoop.mapreduce.MRJobConfig; import org.apache.hadoop.mapreduce.TaskAttemptID; import org.apache.hadoop.mapreduce.TaskID; import org.apache.hadoop.mapreduce.util.ProcessTree; import org.apache.hadoop.util.Shell; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.ShutdownHookManager; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.log4j.Appender; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import java.lang.reflect.Field; import com.google.common.base.Charsets; /** * A simple logger to handle the task-specific user logs. * This class uses the system property hadoop.log.dir. * */ @InterfaceAudience.Private public class TaskLog { private static final Log LOG = LogFactory.getLog(TaskLog.class); static final String USERLOGS_DIR_NAME = "userlogs"; private static final File LOG_DIR = new File(getBaseLogDir(), USERLOGS_DIR_NAME).getAbsoluteFile(); // localFS is set in (and used by) writeToIndexFile() static LocalFileSystem localFS = null; private static String getYarnAppContainerLogDir(){ try{ Field field = YarnConfiguration.class.getField("YARN_APP_CONTAINER_LOG_DIR"); if(field!=null) return (String) field.get(null); }catch(Exception e){} return "yarn.app.container.log.dir"; } public static String getMRv2LogDir() { return System.getProperty(getYarnAppContainerLogDir()); } public static File getTaskLogFile(TaskAttemptID taskid, boolean isCleanup, LogName filter) { if (getMRv2LogDir() != null) { return new File(getMRv2LogDir(), filter.toString()); } else { return new File(getAttemptDir(taskid, isCleanup), filter.toString()); } } static File getRealTaskLogFileLocation(TaskAttemptID taskid, boolean isCleanup, LogName filter) { LogFileDetail l; try { l = getLogFileDetail(taskid, filter, isCleanup); } catch (IOException ie) { LOG.error("getTaskLogFileDetail threw an exception " + ie); return null; } return new File(l.location, filter.toString()); } private static class LogFileDetail { final static String LOCATION = "LOG_DIR:"; String location; long start; long length; } private static LogFileDetail getLogFileDetail(TaskAttemptID taskid, LogName filter, boolean isCleanup) throws IOException { File indexFile = getIndexFile(taskid, isCleanup); BufferedReader fis = new BufferedReader(new InputStreamReader( SecureIOUtils.openForRead(indexFile, obtainLogDirOwner(taskid), null), Charsets.UTF_8)); //the format of the index file is //LOG_DIR: //stdout: //stderr: 
//syslog: LogFileDetail l = new LogFileDetail(); String str = null; try { str = fis.readLine(); if (str == null) { // the file doesn't have anything throw new IOException("Index file for the log of " + taskid + " doesn't exist."); } l.location = str.substring(str.indexOf(LogFileDetail.LOCATION) + LogFileDetail.LOCATION.length()); // special cases are the debugout and profile.out files. They are // guaranteed // to be associated with each task attempt since jvm reuse is disabled // when profiling/debugging is enabled if (filter.equals(LogName.DEBUGOUT) || filter.equals(LogName.PROFILE)) { l.length = new File(l.location, filter.toString()).length(); l.start = 0; fis.close(); return l; } str = fis.readLine(); while (str != null) { // look for the exact line containing the logname if (str.contains(filter.toString())) { str = str.substring(filter.toString().length() + 1); String[] startAndLen = str.split(" "); l.start = Long.parseLong(startAndLen[0]); l.length = Long.parseLong(startAndLen[1]); break; } str = fis.readLine(); } fis.close(); fis = null; } finally { IOUtils.cleanup(LOG, fis); } return l; } private static File getTmpIndexFile(TaskAttemptID taskid, boolean isCleanup) { return new File(getAttemptDir(taskid, isCleanup), "log.tmp"); } static File getIndexFile(TaskAttemptID taskid, boolean isCleanup) { return new File(getAttemptDir(taskid, isCleanup), "log.index"); } /** * Obtain the owner of the log dir. This is * determined by checking the job's log directory. */ static String obtainLogDirOwner(TaskAttemptID taskid) throws IOException { Configuration conf = new Configuration(); FileSystem raw = FileSystem.getLocal(conf).getRaw(); Path jobLogDir = new Path(getJobDir(taskid.getJobID()).getAbsolutePath()); FileStatus jobStat = raw.getFileStatus(jobLogDir); return jobStat.getOwner(); } static String getBaseLogDir() { return System.getProperty("hadoop.log.dir"); } static File getAttemptDir(TaskAttemptID taskid, boolean isCleanup) { String cleanupSuffix = isCleanup ? ".cleanup" : ""; return new File(getJobDir(taskid.getJobID()), taskid + cleanupSuffix); } private static long prevOutLength; private static long prevErrLength; private static long prevLogLength; private static synchronized void writeToIndexFile(String logLocation, boolean isCleanup) throws IOException { // To ensure atomicity of updates to index file, write to temporary index // file first and then rename. 
File tmpIndexFile = getTmpIndexFile(currentTaskid, isCleanup); BufferedOutputStream bos = new BufferedOutputStream( SecureIOUtils.createForWrite(tmpIndexFile, 0644)); DataOutputStream dos = new DataOutputStream(bos); //the format of the index file is //LOG_DIR: //STDOUT: //STDERR: //SYSLOG: try{ dos.writeBytes(LogFileDetail.LOCATION + logLocation + "\n" + LogName.STDOUT.toString() + ":"); dos.writeBytes(Long.toString(prevOutLength) + " "); dos.writeBytes(Long.toString(new File(logLocation, LogName.STDOUT .toString()).length() - prevOutLength) + "\n" + LogName.STDERR + ":"); dos.writeBytes(Long.toString(prevErrLength) + " "); dos.writeBytes(Long.toString(new File(logLocation, LogName.STDERR .toString()).length() - prevErrLength) + "\n" + LogName.SYSLOG.toString() + ":"); dos.writeBytes(Long.toString(prevLogLength) + " "); dos.writeBytes(Long.toString(new File(logLocation, LogName.SYSLOG .toString()).length() - prevLogLength) + "\n"); dos.close(); dos = null; } finally { IOUtils.cleanup(LOG, dos); } File indexFile = getIndexFile(currentTaskid, isCleanup); Path indexFilePath = new Path(indexFile.getAbsolutePath()); Path tmpIndexFilePath = new Path(tmpIndexFile.getAbsolutePath()); if (localFS == null) {// set localFS once localFS = FileSystem.getLocal(new Configuration()); } localFS.rename (tmpIndexFilePath, indexFilePath); } private static void resetPrevLengths(String logLocation) { prevOutLength = new File(logLocation, LogName.STDOUT.toString()).length(); prevErrLength = new File(logLocation, LogName.STDERR.toString()).length(); prevLogLength = new File(logLocation, LogName.SYSLOG.toString()).length(); } private volatile static TaskAttemptID currentTaskid = null; @SuppressWarnings("unchecked") public synchronized static void syncLogs(String logLocation, TaskAttemptID taskid, boolean isCleanup) throws IOException { System.out.flush(); System.err.flush(); Enumeration allLoggers = LogManager.getCurrentLoggers(); while (allLoggers.hasMoreElements()) { Logger l = allLoggers.nextElement(); Enumeration allAppenders = l.getAllAppenders(); while (allAppenders.hasMoreElements()) { Appender a = allAppenders.nextElement(); if (a instanceof TaskLogAppender) { ((TaskLogAppender)a).flush(); } } } if (currentTaskid != taskid) { currentTaskid = taskid; resetPrevLengths(logLocation); } writeToIndexFile(logLocation, isCleanup); } public static synchronized void syncLogsShutdown( ScheduledExecutorService scheduler) { // flush standard streams // System.out.flush(); System.err.flush(); if (scheduler != null) { scheduler.shutdownNow(); } // flush & close all appenders LogManager.shutdown(); } @SuppressWarnings("unchecked") public static synchronized void syncLogs() { // flush standard streams // System.out.flush(); System.err.flush(); // flush flushable appenders // final Logger rootLogger = Logger.getRootLogger(); flushAppenders(rootLogger); final Enumeration allLoggers = rootLogger.getLoggerRepository(). getCurrentLoggers(); while (allLoggers.hasMoreElements()) { final Logger l = allLoggers.nextElement(); flushAppenders(l); } } @SuppressWarnings("unchecked") private static void flushAppenders(Logger l) { final Enumeration allAppenders = l.getAllAppenders(); while (allAppenders.hasMoreElements()) { final Appender a = allAppenders.nextElement(); if (a instanceof Flushable) { try { ((Flushable) a).flush(); } catch (IOException ioe) { System.err.println(a + ": Failed to flush!" 
+ StringUtils.stringifyException(ioe)); } } } } public static ScheduledExecutorService createLogSyncer() { final ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor( new ThreadFactory() { @Override public Thread newThread(Runnable r) { final Thread t = Executors.defaultThreadFactory().newThread(r); t.setDaemon(true); t.setName("Thread for syncLogs"); return t; } }); ShutdownHookManager.get().addShutdownHook(new Runnable() { @Override public void run() { TaskLog.syncLogsShutdown(scheduler); } }, 50); scheduler.scheduleWithFixedDelay( new Runnable() { @Override public void run() { TaskLog.syncLogs(); } }, 0L, 5L, TimeUnit.SECONDS); return scheduler; } /** * The filter for userlogs. */ @InterfaceAudience.Private public static enum LogName { /** Log on the stdout of the task. */ STDOUT ("stdout"), /** Log on the stderr of the task. */ STDERR ("stderr"), /** Log on the map-reduce system logs of the task. */ SYSLOG ("syslog"), /** The java profiler information. */ PROFILE ("profile.out"), /** Log the debug script's stdout */ DEBUGOUT ("debugout"); private String prefix; private LogName(String prefix) { this.prefix = prefix; } @Override public String toString() { return prefix; } } public static class Reader extends InputStream { private long bytesRemaining; private FileInputStream file; /** * Read a log file from start to end positions. The offsets may be negative, * in which case they are relative to the end of the file. For example, * Reader(taskid, kind, 0, -1) is the entire file and * Reader(taskid, kind, -4197, -1) is the last 4196 bytes. * @param taskid the id of the task to read the log file for * @param kind the kind of log to read * @param start the offset to read from (negative is relative to tail) * @param end the offset to read upto (negative is relative to tail) * @param isCleanup whether the attempt is cleanup attempt or not * @throws IOException */ public Reader(TaskAttemptID taskid, LogName kind, long start, long end, boolean isCleanup) throws IOException { // find the right log file LogFileDetail fileDetail = getLogFileDetail(taskid, kind, isCleanup); // calculate the start and stop long size = fileDetail.length; if (start < 0) { start += size + 1; } if (end < 0) { end += size + 1; } start = Math.max(0, Math.min(start, size)); end = Math.max(0, Math.min(end, size)); start += fileDetail.start; end += fileDetail.start; bytesRemaining = end - start; String owner = obtainLogDirOwner(taskid); file = SecureIOUtils.openForRead(new File(fileDetail.location, kind.toString()), owner, null); // skip upto start long pos = 0; while (pos < start) { long result = file.skip(start - pos); if (result < 0) { bytesRemaining = 0; break; } pos += result; } } @Override public int read() throws IOException { int result = -1; if (bytesRemaining > 0) { bytesRemaining -= 1; result = file.read(); } return result; } @Override public int read(byte[] buffer, int offset, int length) throws IOException { length = (int) Math.min(length, bytesRemaining); int bytes = file.read(buffer, offset, length); if (bytes > 0) { bytesRemaining -= bytes; } return bytes; } @Override public int available() throws IOException { return (int) Math.min(bytesRemaining, file.available()); } @Override public void close() throws IOException { file.close(); } } private static final String bashCommand = "bash"; private static final String tailCommand = "tail"; /** * Get the desired maximum length of task's logs. 
* @param conf the job to look in * @return the number of bytes to cap the log files at */ public static long getTaskLogLength(Configuration conf) { return conf.getLong(MRJobConfig.TASK_USERLOG_LIMIT, 0) * 1024; } /** * Wrap a command in a shell to capture stdout and stderr to files. * Setup commands such as setting memory limit can be passed which * will be executed before exec. * If the tailLength is 0, the entire output will be saved. * @param setup The setup commands for the execed process. * @param cmd The command and the arguments that should be run * @param stdoutFilename The filename that stdout should be saved to * @param stderrFilename The filename that stderr should be saved to * @param tailLength The length of the tail to be saved. * @param useSetsid Should setsid be used in the command or not. * @return the modified command that should be run */ public static List captureOutAndError(List setup, List cmd, File stdoutFilename, File stderrFilename, long tailLength, boolean useSetsid ) throws IOException { List result = new ArrayList(3); result.add(bashCommand); result.add("-c"); String mergedCmd = buildCommandLine(setup, cmd, stdoutFilename, stderrFilename, tailLength, useSetsid); result.add(mergedCmd); return result; } /** * Construct the command line for running the task JVM * @param setup The setup commands for the execed process. * @param cmd The command and the arguments that should be run * @param stdoutFilename The filename that stdout should be saved to * @param stderrFilename The filename that stderr should be saved to * @param tailLength The length of the tail to be saved. * @return the command line as a String * @throws IOException */ static String buildCommandLine(List setup, List cmd, File stdoutFilename, File stderrFilename, long tailLength, boolean useSetsid) throws IOException { String stdout = FileUtil.makeShellPath(stdoutFilename); String stderr = FileUtil.makeShellPath(stderrFilename); StringBuffer mergedCmd = new StringBuffer(); // Export the pid of taskJvm to env variable JVM_PID. // Currently pid is not used on Windows if (!Shell.WINDOWS) { mergedCmd.append(" export JVM_PID=`echo $$` ; "); } if (setup != null && setup.size() > 0) { mergedCmd.append(addCommand(setup, false)); mergedCmd.append(";"); } if (tailLength > 0) { mergedCmd.append("("); } else if(ProcessTree.isSetsidAvailable && useSetsid && !Shell.WINDOWS) { mergedCmd.append("exec setsid "); } else { mergedCmd.append("exec "); } mergedCmd.append(addCommand(cmd, true)); mergedCmd.append(" < /dev/null "); if (tailLength > 0) { mergedCmd.append(" | "); mergedCmd.append(tailCommand); mergedCmd.append(" -c "); mergedCmd.append(tailLength); mergedCmd.append(" >> "); mergedCmd.append(stdout); mergedCmd.append(" ; exit $PIPESTATUS ) 2>&1 | "); mergedCmd.append(tailCommand); mergedCmd.append(" -c "); mergedCmd.append(tailLength); mergedCmd.append(" >> "); mergedCmd.append(stderr); mergedCmd.append(" ; exit $PIPESTATUS"); } else { mergedCmd.append(" 1>> "); mergedCmd.append(stdout); mergedCmd.append(" 2>> "); mergedCmd.append(stderr); } return mergedCmd.toString(); } /** * Construct the command line for running the debug script * @param cmd The command and the arguments that should be run * @param stdoutFilename The filename that stdout should be saved to * @param stderrFilename The filename that stderr should be saved to * @param tailLength The length of the tail to be saved. 
   * @return the command line as a String
   * @throws IOException
   */
  static String buildDebugScriptCommandLine(List<String> cmd, String debugout)
      throws IOException {
    StringBuilder mergedCmd = new StringBuilder();
    mergedCmd.append("exec ");
    boolean isExecutable = true;
    for (String s : cmd) {
      if (isExecutable) {
        // the executable name needs to be expressed as a shell path for the
        // shell to find it.
        mergedCmd.append(FileUtil.makeShellPath(new File(s)));
        isExecutable = false;
      } else {
        mergedCmd.append(s);
      }
      mergedCmd.append(" ");
    }
    mergedCmd.append(" < /dev/null ");
    mergedCmd.append(" >");
    mergedCmd.append(debugout);
    mergedCmd.append(" 2>&1 ");
    return mergedCmd.toString();
  }

  /**
   * Add quotes to each of the command strings and
   * return as a single string
   * @param cmd The command to be quoted
   * @param isExecutable makes shell path if the first
   * argument is executable
   * @return the quoted string
   * @throws IOException
   */
  public static String addCommand(List<String> cmd, boolean isExecutable)
      throws IOException {
    StringBuffer command = new StringBuffer();
    for (String s : cmd) {
      command.append('\'');
      if (isExecutable) {
        // the executable name needs to be expressed as a shell path for the
        // shell to find it.
        command.append(FileUtil.makeShellPath(new File(s)));
        isExecutable = false;
      } else {
        command.append(s);
      }
      command.append('\'');
      command.append(" ");
    }
    return command.toString();
  }

  /**
   * Method to return the location of user log directory.
   *
   * @return base log directory
   */
  static File getUserLogDir() {
    if (!LOG_DIR.exists()) {
      boolean b = LOG_DIR.mkdirs();
      if (!b) {
        LOG.debug("mkdirs failed. Ignoring.");
      }
    }
    return LOG_DIR;
  }

  /**
   * Get the user log directory for the job jobid.
   *
   * @param jobid
   * @return user log directory for the job
   */
  public static File getJobDir(JobID jobid) {
    return new File(getUserLogDir(), jobid.toString());
  }

} // TaskLog

================================================
FILE: src/it/crs4/pydoop/mapreduce/pipes/TaskLogAppender.java
================================================
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package it.crs4.pydoop.mapreduce.pipes;

import java.io.Flushable;
import java.util.LinkedList;
import java.util.Queue;

import org.apache.log4j.FileAppender;
import org.apache.log4j.spi.LoggingEvent;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.mapreduce.TaskAttemptID;

/**
 * A simple log4j-appender for the task child's
 * map-reduce system logs.
 */
@InterfaceStability.Unstable
public class TaskLogAppender extends FileAppender implements Flushable {

  private String taskId; //taskId should be managed as String rather than TaskID object
  //so that log4j can configure it from the configuration(log4j.properties).
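  // Configuration sketch (an editorial aside, not part of this source file):
  // the appender is typically wired up through log4j.properties, with the
  // task runner filling in the system properties that
  // setOptionsFromSystemProperties() below reads.  Something along these
  // lines; the appender name "TLA" is a hypothetical choice:
  //
  //   log4j.appender.TLA=it.crs4.pydoop.mapreduce.pipes.TaskLogAppender
  //   log4j.appender.TLA.taskId=${hadoop.tasklog.taskid}
  //   log4j.appender.TLA.isCleanup=${hadoop.tasklog.iscleanup}
  //   log4j.appender.TLA.totalLogFileSize=${hadoop.tasklog.totalLogFileSize}
  //   log4j.appender.TLA.layout=org.apache.log4j.PatternLayout
  //
  // log4j 1.x expands ${...} from system properties, so each property set by
  // the JVM runner lands in the matching setter (setTaskId, setIsCleanup,
  // setTotalLogFileSize).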
  private Integer maxEvents;
  private Queue<LoggingEvent> tail = null;
  private Boolean isCleanup;

  // System properties passed in from JVM runner
  static final String ISCLEANUP_PROPERTY = "hadoop.tasklog.iscleanup";
  static final String LOGSIZE_PROPERTY = "hadoop.tasklog.totalLogFileSize";
  static final String TASKID_PROPERTY = "hadoop.tasklog.taskid";

  @Override
  public void activateOptions() {
    synchronized (this) {
      setOptionsFromSystemProperties();
      if (maxEvents > 0) {
        tail = new LinkedList<LoggingEvent>();
      }
      setFile(TaskLog.getTaskLogFile(TaskAttemptID.forName(taskId),
          isCleanup, TaskLog.LogName.SYSLOG).toString());
      setAppend(true);
      super.activateOptions();
    }
  }

  /**
   * The Task Runner passes in the options as system properties. Set
   * the options if the setters haven't already been called.
   */
  private synchronized void setOptionsFromSystemProperties() {
    if (isCleanup == null) {
      String propValue = System.getProperty(ISCLEANUP_PROPERTY, "false");
      isCleanup = Boolean.valueOf(propValue);
    }
    if (taskId == null) {
      taskId = System.getProperty(TASKID_PROPERTY);
    }
    if (maxEvents == null) {
      String propValue = System.getProperty(LOGSIZE_PROPERTY, "0");
      setTotalLogFileSize(Long.valueOf(propValue));
    }
  }

  @Override
  public void append(LoggingEvent event) {
    synchronized (this) {
      if (tail == null) {
        super.append(event);
      } else {
        if (tail.size() >= maxEvents) {
          tail.remove();
        }
        tail.add(event);
      }
    }
  }

  @Override
  public void flush() {
    if (qw != null) {
      qw.flush();
    }
  }

  @Override
  public synchronized void close() {
    if (tail != null) {
      for (LoggingEvent event : tail) {
        super.append(event);
      }
    }
    super.close();
  }

  /**
   * Getter/Setter methods for log4j.
   */
  public synchronized String getTaskId() {
    return taskId;
  }

  public synchronized void setTaskId(String taskId) {
    this.taskId = taskId;
  }

  private static final int EVENT_SIZE = 100;

  public synchronized long getTotalLogFileSize() {
    return maxEvents * EVENT_SIZE;
  }

  public synchronized void setTotalLogFileSize(long logSize) {
    maxEvents = (int) (logSize / EVENT_SIZE);
  }

  /**
   * Set whether the task is a cleanup attempt or not.
   *
   * @param isCleanup
   *          true if the task is cleanup attempt, false otherwise.
   */
  public synchronized void setIsCleanup(boolean isCleanup) {
    this.isCleanup = isCleanup;
  }

  /**
   * Get whether task is cleanup attempt or not.
   *
   * @return true if the task is cleanup attempt, false otherwise.
   */
  public synchronized boolean getIsCleanup() {
    return isCleanup;
  }
}

================================================
FILE: src/it/crs4/pydoop/mapreduce/pipes/UpwardProtocol.java
================================================
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package it.crs4.pydoop.mapreduce.pipes;

import java.io.IOException;

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;

/**
 * The interface for the messages that can come up from the child.
 * All of these calls are asynchronous and return before the message has
 * been processed.
 */
interface UpwardProtocol<K extends WritableComparable, V extends Writable> {

  /**
   * Output a record from the child.
   * @param key the record's key
   * @param value the record's value
   * @throws IOException
   */
  void output(K key, V value) throws IOException, InterruptedException;

  /**
   * Map functions where the application has defined a partition function
   * output records along with their partition.
   * @param reduce the reduce to send this record to
   * @param key the record's key
   * @param value the record's value
   * @throws IOException
   */
  void partitionedOutput(int reduce, K key, V value)
      throws IOException, InterruptedException;

  /**
   * Update the task's status message
   * @param msg the string to display to the user
   * @throws IOException
   */
  void status(String msg) throws IOException, InterruptedException;

  /**
   * Report making progress (and the current progress)
   * @param progress the current progress (0.0 to 1.0)
   * @throws IOException
   */
  void progress(float progress) throws IOException, InterruptedException;

  /**
   * Report that the application has finished processing all inputs
   * successfully.
   * @throws IOException
   */
  void done() throws IOException, InterruptedException;

  /**
   * Report that the application or more likely communication failed.
   * @param e
   */
  void failed(Throwable e);

  /**
   * Register a counter with the given id and group/name.
   * @param group counter group
   * @param name counter name
   * @throws IOException
   */
  void registerCounter(int id, String group, String name) throws IOException;

  /**
   * Increment the value of a registered counter.
   * @param id counter id of the registered counter
   * @param amount increment for the counter value
   * @throws IOException
   */
  void incrementCounter(int id, long amount) throws IOException;

  /**
   * Handles authentication response from client.
   * It must notify the threads waiting for authentication response.
   * @param digest
   * @return true if authentication is successful
   * @throws IOException
   */
  boolean authenticate(String digest) throws IOException;
}

================================================
FILE: src/libhdfs/common/htable.c
================================================
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "common/htable.h"

#include <errno.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct htable_pair {
    void *key;
    void *val;
};

/**
 * A hash table which uses linear probing.
 */
struct htable {
    uint32_t capacity;
    uint32_t used;
    htable_hash_fn_t hash_fun;
    htable_eq_fn_t eq_fun;
    struct htable_pair *elem;
};

/**
 * An internal function for inserting a value into the hash table.
 *
 * Note: this function assumes that you have made enough space in the table.
 *
 * @param nelem     The new element to insert.
 * @param capacity  The capacity of the hash table.
 * @param hash_fun  The hash function to use.
* @param key The key to insert. * @param val The value to insert. */ static void htable_insert_internal(struct htable_pair *nelem, uint32_t capacity, htable_hash_fn_t hash_fun, void *key, void *val) { uint32_t i; i = hash_fun(key, capacity); while (1) { if (!nelem[i].key) { nelem[i].key = key; nelem[i].val = val; return; } i++; if (i == capacity) { i = 0; } } } static int htable_realloc(struct htable *htable, uint32_t new_capacity) { struct htable_pair *nelem; uint32_t i, old_capacity = htable->capacity; htable_hash_fn_t hash_fun = htable->hash_fun; nelem = calloc(new_capacity, sizeof(struct htable_pair)); if (!nelem) { return ENOMEM; } for (i = 0; i < old_capacity; i++) { struct htable_pair *pair = htable->elem + i; if (pair->key) { htable_insert_internal(nelem, new_capacity, hash_fun, pair->key, pair->val); } } free(htable->elem); htable->elem = nelem; htable->capacity = new_capacity; return 0; } static uint32_t round_up_to_power_of_2(uint32_t i) { if (i == 0) { return 1; } i--; i |= i >> 1; i |= i >> 2; i |= i >> 4; i |= i >> 8; i |= i >> 16; i++; return i; } struct htable *htable_alloc(uint32_t size, htable_hash_fn_t hash_fun, htable_eq_fn_t eq_fun) { struct htable *htable; htable = calloc(1, sizeof(*htable)); if (!htable) { return NULL; } size = round_up_to_power_of_2(size); if (size < HTABLE_MIN_SIZE) { size = HTABLE_MIN_SIZE; } htable->hash_fun = hash_fun; htable->eq_fun = eq_fun; htable->used = 0; if (htable_realloc(htable, size)) { free(htable); return NULL; } return htable; } void htable_visit(struct htable *htable, visitor_fn_t fun, void *ctx) { uint32_t i; for (i = 0; i != htable->capacity; ++i) { struct htable_pair *elem = htable->elem + i; if (elem->key) { fun(ctx, elem->key, elem->val); } } } void htable_free(struct htable *htable) { if (htable) { free(htable->elem); free(htable); } } int htable_put(struct htable *htable, void *key, void *val) { int ret; uint32_t nused; // NULL is not a valid key value. // This helps us implement htable_get_internal efficiently, since we know // that we can stop when we encounter the first NULL key. if (!key) { return EINVAL; } // NULL is not a valid value. Otherwise the results of htable_get would // be confusing (does a NULL return mean entry not found, or that the // entry was found and was NULL?) if (!val) { return EINVAL; } // Re-hash if we have used more than half of the hash table nused = htable->used + 1; if (nused >= (htable->capacity / 2)) { ret = htable_realloc(htable, htable->capacity * 2); if (ret) return ret; } htable_insert_internal(htable->elem, htable->capacity, htable->hash_fun, key, val); htable->used++; return 0; } static int htable_get_internal(const struct htable *htable, const void *key, uint32_t *out) { uint32_t start_idx, idx; start_idx = htable->hash_fun(key, htable->capacity); idx = start_idx; while (1) { struct htable_pair *pair = htable->elem + idx; if (!pair->key) { // We always maintain the invariant that the entries corresponding // to a given key are stored in a contiguous block, not separated // by any NULLs. So if we encounter a NULL, our search is over. 
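            // Worked example of that invariant: with capacity 8 and
            // hash("a") == hash("b") == 3, inserting "a" then "b" fills
            // slots 3 and 4.  A lookup of "b" probes slot 3 ("a", not
            // equal), then slot 4 (match).  A lookup of a key "c" that
            // also hashes to 3 probes forward until the first NULL slot
            // and correctly reports ENOENT; htable_pop() re-compacts
            // runs on removal precisely so this early-out stays valid.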
            return ENOENT;
        } else if (htable->eq_fun(pair->key, key)) {
            *out = idx;
            return 0;
        }
        idx++;
        if (idx == htable->capacity) {
            idx = 0;
        }
        if (idx == start_idx) {
            return ENOENT;
        }
    }
}

void *htable_get(const struct htable *htable, const void *key)
{
    uint32_t idx;

    if (htable_get_internal(htable, key, &idx)) {
        return NULL;
    }
    return htable->elem[idx].val;
}

void htable_pop(struct htable *htable, const void *key,
                void **found_key, void **found_val)
{
    uint32_t hole, i;
    const void *nkey;

    if (htable_get_internal(htable, key, &hole)) {
        *found_key = NULL;
        *found_val = NULL;
        return;
    }
    i = hole;
    htable->used--;
    // We need to maintain the compactness invariant used in
    // htable_get_internal.  This invariant specifies that the entries for
    // any given key are never separated by NULLs (although they may be
    // separated by entries for other keys.)
    while (1) {
        i++;
        if (i == htable->capacity) {
            i = 0;
        }
        nkey = htable->elem[i].key;
        if (!nkey) {
            *found_key = htable->elem[hole].key;
            *found_val = htable->elem[hole].val;
            htable->elem[hole].key = NULL;
            htable->elem[hole].val = NULL;
            return;
        } else if (htable->eq_fun(key, nkey)) {
            htable->elem[hole].key = htable->elem[i].key;
            htable->elem[hole].val = htable->elem[i].val;
            hole = i;
        }
    }
}

uint32_t htable_used(const struct htable *htable)
{
    return htable->used;
}

uint32_t htable_capacity(const struct htable *htable)
{
    return htable->capacity;
}

uint32_t ht_hash_string(const void *str, uint32_t max)
{
    const char *s = str;
    uint32_t hash = 0;

    while (*s) {
        hash = (hash * 31) + *s;
        s++;
    }
    return hash % max;
}

int ht_compare_string(const void *a, const void *b)
{
    return strcmp(a, b) == 0;
}

// vim: ts=4:sw=4:tw=79:et
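As a quick orientation to the hash-table API above, here is a minimal,
self-contained usage sketch. It is not part of the repository; the function
name example_htable_usage is hypothetical. Keys and values are stored as raw
pointers (never copied), so the caller owns their lifetime:

#include <errno.h>
#include <stdio.h>
#include "common/htable.h"

static void print_pair(void *ctx, void *key, void *val)
{
    (void)ctx;  // unused context pointer
    printf("%s -> %s\n", (const char *)key, (const char *)val);
}

static int example_htable_usage(void)
{
    void *k, *v;
    // ht_hash_string/ht_compare_string are the string helpers from htable.c
    struct htable *ht = htable_alloc(16, ht_hash_string, ht_compare_string);

    if (!ht)
        return ENOMEM;
    // The table stores the pointers themselves; string literals are fine
    // here because they live for the whole program.
    if (htable_put(ht, "alpha", "1") || htable_put(ht, "beta", "2")) {
        htable_free(ht);
        return ENOMEM;
    }
    printf("alpha = %s\n", (const char *)htable_get(ht, "alpha"));
    htable_visit(ht, print_pair, NULL);   // prints every entry
    htable_pop(ht, "beta", &k, &v);       // k/v now hold the removed pair
    htable_free(ht);                      // keys/values are NOT freed
    return 0;
}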
================================================
FILE: src/libhdfs/common/htable.h
================================================
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef HADOOP_CORE_COMMON_HASH_TABLE
#define HADOOP_CORE_COMMON_HASH_TABLE

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define HTABLE_MIN_SIZE 4

struct htable;

/**
 * An HTable hash function.
 *
 * @param key       The key.
 * @param capacity  The total capacity.
 *
 * @return          The hash slot.  Must be less than the capacity.
 */
typedef uint32_t (*htable_hash_fn_t)(const void *key, uint32_t capacity);

/**
 * An HTable equality function.  Compares two keys.
 *
 * @param a         First key.
 * @param b         Second key.
 *
 * @return          nonzero if the keys are equal.
 */
typedef int (*htable_eq_fn_t)(const void *a, const void *b);

/**
 * Allocate a new hash table.
 *
 * @param capacity  The minimum suggested starting capacity.
 * @param hash_fun  The hash function to use in this hash table.
 * @param eq_fun    The equals function to use in this hash table.
 *
 * @return          The new hash table on success; NULL on OOM.
 */
struct htable *htable_alloc(uint32_t capacity, htable_hash_fn_t hash_fun,
                            htable_eq_fn_t eq_fun);

typedef void (*visitor_fn_t)(void *ctx, void *key, void *val);

/**
 * Visit all of the entries in the hash table.
 *
 * @param htable    The hash table.
 * @param fun       The callback function to invoke on each key and value.
 * @param ctx       Context pointer to pass to the callback.
 */
void htable_visit(struct htable *htable, visitor_fn_t fun, void *ctx);

/**
 * Free the hash table.
 *
 * It is up to the calling code to ensure that the keys and values inside the
 * table are de-allocated, if that is necessary.
 *
 * @param htable    The hash table.
 */
void htable_free(struct htable *htable);

/**
 * Add an entry to the hash table.
 *
 * @param htable    The hash table.
 * @param key       The key to add.  This cannot be NULL.
 * @param val       The value to add.  This cannot be NULL.
 *
 * @return          0 on success;
 *                  EEXIST if the value already exists in the table;
 *                  ENOMEM if there is not enough memory to add the element.
 *                  EFBIG if the hash table has too many entries to fit in 32
 *                  bits.
 */
int htable_put(struct htable *htable, void *key, void *val);

/**
 * Get an entry from the hash table.
 *
 * @param htable    The hash table.
 * @param key       The key to find.
 *
 * @return          NULL if there is no such entry; the entry otherwise.
 */
void *htable_get(const struct htable *htable, const void *key);

/**
 * Get an entry from the hash table and remove it.
 *
 * @param htable    The hash table.
 * @param key       The key for the entry find and remove.
 * @param found_key (out param) NULL if the entry was not found; the found key
 *                  otherwise.
 * @param found_val (out param) NULL if the entry was not found; the found
 *                  value otherwise.
 */
void htable_pop(struct htable *htable, const void *key,
                void **found_key, void **found_val);

/**
 * Get the number of entries used in the hash table.
 *
 * @param htable    The hash table.
 *
 * @return          The number of entries used in the hash table.
 */
uint32_t htable_used(const struct htable *htable);

/**
 * Get the capacity of the hash table.
 *
 * @param htable    The hash table.
 *
 * @return          The capacity of the hash table.
 */
uint32_t htable_capacity(const struct htable *htable);

/**
 * Hash a string.
 *
 * @param str       The string.
 * @param max       Maximum hash value
 *
 * @return          A number less than max.
 */
uint32_t ht_hash_string(const void *str, uint32_t max);

/**
 * Compare two strings.
 *
 * @param a         The first string.
 * @param b         The second string.
 *
 * @return          1 if the strings are identical; 0 otherwise.
 */
int ht_compare_string(const void *a, const void *b);

#endif

// vim: ts=4:sw=4:tw=79:et

================================================
FILE: src/libhdfs/config.h
================================================
#ifndef CONFIG_H
#define CONFIG_H
#endif

================================================
FILE: src/libhdfs/exception.c
================================================
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "exception.h"
#include "hdfs/hdfs.h"
#include "jni_helper.h"
#include "platform.h"

#include <inttypes.h>
#include <stdio.h>
#include <string.h>

#define EXCEPTION_INFO_LEN (sizeof(gExceptionInfo)/sizeof(gExceptionInfo[0]))

struct ExceptionInfo {
    const char * const name;
    int noPrintFlag;
    int excErrno;
};

static const struct ExceptionInfo gExceptionInfo[] = {
    { "java.io.FileNotFoundException",
      NOPRINT_EXC_FILE_NOT_FOUND, ENOENT, },
    { "org.apache.hadoop.security.AccessControlException",
      NOPRINT_EXC_ACCESS_CONTROL, EACCES, },
    { "org.apache.hadoop.fs.UnresolvedLinkException",
      NOPRINT_EXC_UNRESOLVED_LINK, ENOLINK, },
    { "org.apache.hadoop.fs.ParentNotDirectoryException",
      NOPRINT_EXC_PARENT_NOT_DIRECTORY, ENOTDIR, },
    { "java.lang.IllegalArgumentException",
      NOPRINT_EXC_ILLEGAL_ARGUMENT, EINVAL, },
    { "java.lang.OutOfMemoryError", 0, ENOMEM, },
    { "org.apache.hadoop.hdfs.server.namenode.SafeModeException",
      0, EROFS, },
    { "org.apache.hadoop.fs.FileAlreadyExistsException", 0, EEXIST, },
    { "org.apache.hadoop.hdfs.protocol.QuotaExceededException",
      0, EDQUOT, },
    { "java.lang.UnsupportedOperationException", 0, ENOTSUP, },
    { "org.apache.hadoop.hdfs.server.namenode.LeaseExpiredException",
      0, ESTALE, },
};

void getExceptionInfo(const char *excName, int noPrintFlags,
                      int *excErrno, int *shouldPrint)
{
    int i;

    for (i = 0; i < EXCEPTION_INFO_LEN; i++) {
        if (strstr(gExceptionInfo[i].name, excName)) {
            break;
        }
    }
    if (i < EXCEPTION_INFO_LEN) {
        *shouldPrint = !(gExceptionInfo[i].noPrintFlag & noPrintFlags);
        *excErrno = gExceptionInfo[i].excErrno;
    } else {
        *shouldPrint = 1;
        *excErrno = EINTERNAL;
    }
}

int printExceptionAndFreeV(JNIEnv *env, jthrowable exc, int noPrintFlags,
                           const char *fmt, va_list ap)
{
    int i, noPrint, excErrno;
    char *className = NULL;
    jstring jStr = NULL;
    jvalue jVal;
    jthrowable jthr;
    const char *stackTrace;

    jthr = classNameOfObject(exc, env, &className);
    if (jthr) {
        fprintf(stderr, "PrintExceptionAndFree: error determining class name "
                "of exception.\n");
        className = strdup("(unknown)");
        destroyLocalReference(env, jthr);
    }
    for (i = 0; i < EXCEPTION_INFO_LEN; i++) {
        if (!strcmp(gExceptionInfo[i].name, className)) {
            break;
        }
    }
    if (i < EXCEPTION_INFO_LEN) {
        noPrint = (gExceptionInfo[i].noPrintFlag & noPrintFlags);
        excErrno = gExceptionInfo[i].excErrno;
    } else {
        noPrint = 0;
        excErrno = EINTERNAL;
    }
    if (!noPrint) {
        vfprintf(stderr, fmt, ap);
        fprintf(stderr, " error:\n");
        // We don't want to use ExceptionDescribe here, because that requires
        // a pending exception.  Instead, use ExceptionUtils.
        jthr = invokeMethod(env, &jVal, STATIC, NULL,
            "org/apache/commons/lang/exception/ExceptionUtils",
            "getStackTrace", "(Ljava/lang/Throwable;)Ljava/lang/String;",
            exc);
        if (jthr) {
            fprintf(stderr, "(unable to get stack trace for %s exception: "
                    "ExceptionUtils::getStackTrace error.)\n", className);
            destroyLocalReference(env, jthr);
        } else {
            jStr = jVal.l;
            stackTrace = (*env)->GetStringUTFChars(env, jStr, NULL);
            if (!stackTrace) {
                fprintf(stderr, "(unable to get stack trace for %s "
                        "exception: GetStringUTFChars error.)\n", className);
            } else {
                fprintf(stderr, "%s", stackTrace);
                (*env)->ReleaseStringUTFChars(env, jStr, stackTrace);
            }
        }
    }
    destroyLocalReference(env, jStr);
    destroyLocalReference(env, exc);
    free(className);
    return excErrno;
}

int printExceptionAndFree(JNIEnv *env, jthrowable exc, int noPrintFlags,
                          const char *fmt, ...)
{
    va_list ap;
    int ret;

    va_start(ap, fmt);
    ret = printExceptionAndFreeV(env, exc, noPrintFlags, fmt, ap);
    va_end(ap);
    return ret;
}

int printPendingExceptionAndFree(JNIEnv *env, int noPrintFlags,
                                 const char *fmt, ...)
{
    va_list ap;
    int ret;
    jthrowable exc;

    exc = (*env)->ExceptionOccurred(env);
    if (!exc) {
        va_start(ap, fmt);
        vfprintf(stderr, fmt, ap);
        va_end(ap);
        fprintf(stderr, " error: (no exception)");
        ret = 0;
    } else {
        (*env)->ExceptionClear(env);
        va_start(ap, fmt);
        ret = printExceptionAndFreeV(env, exc, noPrintFlags, fmt, ap);
        va_end(ap);
    }
    return ret;
}

jthrowable getPendingExceptionAndClear(JNIEnv *env)
{
    jthrowable jthr = (*env)->ExceptionOccurred(env);
    if (!jthr)
        return NULL;
    (*env)->ExceptionClear(env);
    return jthr;
}

jthrowable newRuntimeError(JNIEnv *env, const char *fmt, ...)
{
    char buf[512];
    jobject out, exc;
    jstring jstr;
    va_list ap;

    va_start(ap, fmt);
    vsnprintf(buf, sizeof(buf), fmt, ap);
    va_end(ap);
    jstr = (*env)->NewStringUTF(env, buf);
    if (!jstr) {
        // We got an out of memory exception rather than a RuntimeException.
        // Too bad...
        return getPendingExceptionAndClear(env);
    }
    // Class name and constructor signature in proper JNI form.
    exc = constructNewObjectOfClass(env, &out, "java/lang/RuntimeException",
                                    "(Ljava/lang/String;)V", jstr);
    (*env)->DeleteLocalRef(env, jstr);
    // Again, we'll either get an out of memory exception or the
    // RuntimeException we wanted.
    return (exc) ? exc : out;
}

================================================
FILE: src/libhdfs/exception.h
================================================
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LIBHDFS_EXCEPTION_H
#define LIBHDFS_EXCEPTION_H

/**
 * Exception handling routines for libhdfs.
 *
 * The convention we follow here is to clear pending exceptions as soon as
 * they are raised.  Never assume that the caller of your function will clean
 * up after you-- do it yourself.  Unhandled exceptions can lead to memory
 * leaks and other undefined behavior.
 *
 * If you encounter an exception, return a local reference to it.  The caller
 * is responsible for freeing the local reference, by calling a function like
 * PrintExceptionAndFree.  (You can also free exceptions directly by calling
 * DeleteLocalRef.  However, that would not produce an error message, so it's
 * usually not what you want.)
 */

#include "platform.h"

#include <jni.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#include <errno.h>

/**
 * Exception noprint flags
 *
 * These flags determine which exceptions should NOT be printed to stderr by
 * the exception printing routines. For example, if you expect to see
 * FileNotFound, you might use NOPRINT_EXC_FILE_NOT_FOUND, to avoid filling
 * the logs with messages about routine events.
 *
 * On the other hand, if you don't expect any failures, you might pass
 * PRINT_EXC_ALL.
 *
 * You can OR these flags together to avoid printing multiple classes of
 * exceptions.
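 *
 * For example, a caller that treats a missing file as a routine event can
 * suppress just that class of message (a usage sketch, not code from this
 * header; "myLookup" is a hypothetical operation name):
 *
 *   errno = printPendingExceptionAndFree(env, NOPRINT_EXC_FILE_NOT_FOUND,
 *                                        "myLookup(%s)", path);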
*/ #define PRINT_EXC_ALL 0x00 #define NOPRINT_EXC_FILE_NOT_FOUND 0x01 #define NOPRINT_EXC_ACCESS_CONTROL 0x02 #define NOPRINT_EXC_UNRESOLVED_LINK 0x04 #define NOPRINT_EXC_PARENT_NOT_DIRECTORY 0x08 #define NOPRINT_EXC_ILLEGAL_ARGUMENT 0x10 /** * Get information about an exception. * * @param excName The Exception name. * This is a Java class name in JNI format. * @param noPrintFlags Flags which determine which exceptions we should NOT * print. * @param excErrno (out param) The POSIX error number associated with the * exception. * @param shouldPrint (out param) Nonzero if we should print this exception, * based on the noPrintFlags and its name. */ void getExceptionInfo(const char *excName, int noPrintFlags, int *excErrno, int *shouldPrint); /** * Print out information about an exception and free it. * * @param env The JNI environment * @param exc The exception to print and free * @param noPrintFlags Flags which determine which exceptions we should NOT * print. * @param fmt Printf-style format list * @param ap Printf-style varargs * * @return The POSIX error number associated with the exception * object. */ int printExceptionAndFreeV(JNIEnv *env, jthrowable exc, int noPrintFlags, const char *fmt, va_list ap); /** * Print out information about an exception and free it. * * @param env The JNI environment * @param exc The exception to print and free * @param noPrintFlags Flags which determine which exceptions we should NOT * print. * @param fmt Printf-style format list * @param ... Printf-style varargs * * @return The POSIX error number associated with the exception * object. */ int printExceptionAndFree(JNIEnv *env, jthrowable exc, int noPrintFlags, const char *fmt, ...) TYPE_CHECKED_PRINTF_FORMAT(4, 5); /** * Print out information about the pending exception and free it. * * @param env The JNI environment * @param noPrintFlags Flags which determine which exceptions we should NOT * print. * @param fmt Printf-style format list * @param ... Printf-style varargs * * @return The POSIX error number associated with the exception * object. */ int printPendingExceptionAndFree(JNIEnv *env, int noPrintFlags, const char *fmt, ...) TYPE_CHECKED_PRINTF_FORMAT(3, 4); /** * Get a local reference to the pending exception and clear it. * * Once it is cleared, the exception will no longer be pending. The caller will * have to decide what to do with the exception object. * * @param env The JNI environment * * @return The exception, or NULL if there was no exception */ jthrowable getPendingExceptionAndClear(JNIEnv *env); /** * Create a new runtime error. * * This creates (but does not throw) a new RuntimeError. * * @param env The JNI environment * @param fmt Printf-style format list * @param ... Printf-style varargs * * @return A local reference to a RuntimeError */ jthrowable newRuntimeError(JNIEnv *env, const char *fmt, ...) TYPE_CHECKED_PRINTF_FORMAT(2, 3); #undef TYPE_CHECKED_PRINTF_FORMAT #endif ================================================ FILE: src/libhdfs/hdfs.c ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "exception.h"
#include "hdfs/hdfs.h"
#include "jni_helper.h"
#include "platform.h"

#include <fcntl.h>
#include <inttypes.h>
#include <stdio.h>
#include <string.h>

/* Some frequently used Java paths */
#define HADOOP_CONF     "org/apache/hadoop/conf/Configuration"
#define HADOOP_PATH     "org/apache/hadoop/fs/Path"
#define HADOOP_LOCALFS  "org/apache/hadoop/fs/LocalFileSystem"
#define HADOOP_FS       "org/apache/hadoop/fs/FileSystem"
#define HADOOP_FSSTATUS "org/apache/hadoop/fs/FsStatus"
#define HADOOP_BLK_LOC  "org/apache/hadoop/fs/BlockLocation"
#define HADOOP_DFS      "org/apache/hadoop/hdfs/DistributedFileSystem"
#define HADOOP_ISTRM    "org/apache/hadoop/fs/FSDataInputStream"
#define HADOOP_OSTRM    "org/apache/hadoop/fs/FSDataOutputStream"
#define HADOOP_STAT     "org/apache/hadoop/fs/FileStatus"
#define HADOOP_FSPERM   "org/apache/hadoop/fs/permission/FsPermission"
#define JAVA_NET_ISA    "java/net/InetSocketAddress"
#define JAVA_NET_URI    "java/net/URI"
#define JAVA_STRING     "java/lang/String"
#define READ_OPTION     "org/apache/hadoop/fs/ReadOption"

#define JAVA_VOID       "V"

/* Macros for constructing method signatures */
#define JPARAM(X)            "L" X ";"
#define JARRPARAM(X)         "[L" X ";"
#define JMETHOD1(X, R)       "(" X ")" R
#define JMETHOD2(X, Y, R)    "(" X Y ")" R
#define JMETHOD3(X, Y, Z, R) "(" X Y Z")" R

#define KERBEROS_TICKET_CACHE_PATH "hadoop.security.kerberos.ticket.cache.path"

// Bit fields for hdfsFile_internal flags
#define HDFS_FILE_SUPPORTS_DIRECT_READ (1<<0)

tSize readDirect(hdfsFS fs, hdfsFile f, void* buffer, tSize length);
static void hdfsFreeFileInfoEntry(hdfsFileInfo *hdfsFileInfo);

/**
 * The C equivalent of org.apache.hadoop.fs.FSData(Input|Output)Stream .
 */
enum hdfsStreamType
{
    HDFS_STREAM_UNINITIALIZED = 0,
    HDFS_STREAM_INPUT = 1,
    HDFS_STREAM_OUTPUT = 2,
};

/**
 * The 'file-handle' to a file in hdfs.
 */
struct hdfsFile_internal {
    void* file;
    enum hdfsStreamType type;
    int flags;
};

#define HDFS_EXTENDED_FILE_INFO_ENCRYPTED 0x1

/**
 * Extended file information.
*/ struct hdfsExtendedFileInfo { int flags; }; int hdfsFileIsOpenForRead(hdfsFile file) { return (file->type == HDFS_STREAM_INPUT); } int hdfsFileGetReadStatistics(hdfsFile file, struct hdfsReadStatistics **stats) { jthrowable jthr; jobject readStats = NULL; jvalue jVal; struct hdfsReadStatistics *s = NULL; int ret; JNIEnv* env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; return -1; } if (file->type != HDFS_STREAM_INPUT) { ret = EINVAL; goto done; } jthr = invokeMethod(env, &jVal, INSTANCE, file->file, "org/apache/hadoop/hdfs/client/HdfsDataInputStream", "getReadStatistics", "()Lorg/apache/hadoop/hdfs/DFSInputStream$ReadStatistics;"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsFileGetReadStatistics: getReadStatistics failed"); goto done; } readStats = jVal.l; s = malloc(sizeof(struct hdfsReadStatistics)); if (!s) { ret = ENOMEM; goto done; } jthr = invokeMethod(env, &jVal, INSTANCE, readStats, "org/apache/hadoop/hdfs/DFSInputStream$ReadStatistics", "getTotalBytesRead", "()J"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsFileGetReadStatistics: getTotalBytesRead failed"); goto done; } s->totalBytesRead = jVal.j; jthr = invokeMethod(env, &jVal, INSTANCE, readStats, "org/apache/hadoop/hdfs/DFSInputStream$ReadStatistics", "getTotalLocalBytesRead", "()J"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsFileGetReadStatistics: getTotalLocalBytesRead failed"); goto done; } s->totalLocalBytesRead = jVal.j; jthr = invokeMethod(env, &jVal, INSTANCE, readStats, "org/apache/hadoop/hdfs/DFSInputStream$ReadStatistics", "getTotalShortCircuitBytesRead", "()J"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsFileGetReadStatistics: getTotalShortCircuitBytesRead failed"); goto done; } s->totalShortCircuitBytesRead = jVal.j; jthr = invokeMethod(env, &jVal, INSTANCE, readStats, "org/apache/hadoop/hdfs/DFSInputStream$ReadStatistics", "getTotalZeroCopyBytesRead", "()J"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsFileGetReadStatistics: getTotalZeroCopyBytesRead failed"); goto done; } s->totalZeroCopyBytesRead = jVal.j; *stats = s; s = NULL; ret = 0; done: destroyLocalReference(env, readStats); free(s); if (ret) { errno = ret; return -1; } return 0; } int64_t hdfsReadStatisticsGetRemoteBytesRead( const struct hdfsReadStatistics *stats) { return stats->totalBytesRead - stats->totalLocalBytesRead; } int hdfsFileClearReadStatistics(hdfsFile file) { jthrowable jthr; int ret; JNIEnv* env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; return EINTERNAL; } if (file->type != HDFS_STREAM_INPUT) { ret = EINVAL; goto done; } jthr = invokeMethod(env, NULL, INSTANCE, file->file, "org/apache/hadoop/hdfs/client/HdfsDataInputStream", "clearReadStatistics", "()V"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsFileClearReadStatistics: clearReadStatistics failed"); goto done; } ret = 0; done: if (ret) { errno = ret; return ret; } return 0; } void hdfsFileFreeReadStatistics(struct hdfsReadStatistics *stats) { free(stats); } int hdfsFileIsOpenForWrite(hdfsFile file) { return (file->type == HDFS_STREAM_OUTPUT); } int hdfsFileUsesDirectRead(hdfsFile file) { return !!(file->flags & HDFS_FILE_SUPPORTS_DIRECT_READ); } void hdfsFileDisableDirectRead(hdfsFile file) { file->flags &= ~HDFS_FILE_SUPPORTS_DIRECT_READ; } int hdfsDisableDomainSocketSecurity(void) { jthrowable jthr; JNIEnv* env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; return -1; } jthr = 
invokeMethod(env, NULL, STATIC, NULL, "org/apache/hadoop/net/unix/DomainSocket", "disableBindPathValidation", "()V"); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "DomainSocket#disableBindPathValidation"); return -1; } return 0; } /** * hdfsJniEnv: A wrapper struct to be used as 'value' * while saving thread -> JNIEnv* mappings */ typedef struct { JNIEnv* env; } hdfsJniEnv; /** * Helper function to create a org.apache.hadoop.fs.Path object. * @param env: The JNIEnv pointer. * @param path: The file-path for which to construct org.apache.hadoop.fs.Path * object. * @return Returns a jobject on success and NULL on error. */ static jthrowable constructNewObjectOfPath(JNIEnv *env, const char *path, jobject *out) { jthrowable jthr; jstring jPathString; jobject jPath; //Construct a java.lang.String object jthr = newJavaStr(env, path, &jPathString); if (jthr) return jthr; //Construct the org.apache.hadoop.fs.Path object jthr = constructNewObjectOfClass(env, &jPath, "org/apache/hadoop/fs/Path", "(Ljava/lang/String;)V", jPathString); destroyLocalReference(env, jPathString); if (jthr) return jthr; *out = jPath; return NULL; } static jthrowable hadoopConfGetStr(JNIEnv *env, jobject jConfiguration, const char *key, char **val) { jthrowable jthr; jvalue jVal; jstring jkey = NULL, jRet = NULL; jthr = newJavaStr(env, key, &jkey); if (jthr) goto done; jthr = invokeMethod(env, &jVal, INSTANCE, jConfiguration, HADOOP_CONF, "get", JMETHOD1(JPARAM(JAVA_STRING), JPARAM(JAVA_STRING)), jkey); if (jthr) goto done; jRet = jVal.l; jthr = newCStr(env, jRet, val); done: destroyLocalReference(env, jkey); destroyLocalReference(env, jRet); return jthr; } int hdfsConfGetStr(const char *key, char **val) { JNIEnv *env; int ret; jthrowable jthr; jobject jConfiguration = NULL; env = getJNIEnv(); if (env == NULL) { ret = EINTERNAL; goto done; } jthr = constructNewObjectOfClass(env, &jConfiguration, HADOOP_CONF, "()V"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsConfGetStr(%s): new Configuration", key); goto done; } jthr = hadoopConfGetStr(env, jConfiguration, key, val); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsConfGetStr(%s): hadoopConfGetStr", key); goto done; } ret = 0; done: destroyLocalReference(env, jConfiguration); if (ret) errno = ret; return ret; } void hdfsConfStrFree(char *val) { free(val); } static jthrowable hadoopConfGetInt(JNIEnv *env, jobject jConfiguration, const char *key, int32_t *val) { jthrowable jthr = NULL; jvalue jVal; jstring jkey = NULL; jthr = newJavaStr(env, key, &jkey); if (jthr) return jthr; jthr = invokeMethod(env, &jVal, INSTANCE, jConfiguration, HADOOP_CONF, "getInt", JMETHOD2(JPARAM(JAVA_STRING), "I", "I"), jkey, (jint)(*val)); destroyLocalReference(env, jkey); if (jthr) return jthr; *val = jVal.i; return NULL; } int hdfsConfGetInt(const char *key, int32_t *val) { JNIEnv *env; int ret; jobject jConfiguration = NULL; jthrowable jthr; env = getJNIEnv(); if (env == NULL) { ret = EINTERNAL; goto done; } jthr = constructNewObjectOfClass(env, &jConfiguration, HADOOP_CONF, "()V"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsConfGetInt(%s): new Configuration", key); goto done; } jthr = hadoopConfGetInt(env, jConfiguration, key, val); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsConfGetInt(%s): hadoopConfGetInt", key); goto done; } ret = 0; done: destroyLocalReference(env, jConfiguration); if (ret) errno = ret; return ret; } struct hdfsBuilderConfOpt { struct 
hdfsBuilderConfOpt *next; const char *key; const char *val; }; struct hdfsBuilder { int forceNewInstance; const char *nn; tPort port; const char *kerbTicketCachePath; const char *userName; struct hdfsBuilderConfOpt *opts; }; struct hdfsBuilder *hdfsNewBuilder(void) { struct hdfsBuilder *bld = calloc(1, sizeof(struct hdfsBuilder)); if (!bld) { errno = ENOMEM; return NULL; } return bld; } int hdfsBuilderConfSetStr(struct hdfsBuilder *bld, const char *key, const char *val) { struct hdfsBuilderConfOpt *opt, *next; opt = calloc(1, sizeof(struct hdfsBuilderConfOpt)); if (!opt) return -ENOMEM; next = bld->opts; bld->opts = opt; opt->next = next; opt->key = key; opt->val = val; return 0; } void hdfsFreeBuilder(struct hdfsBuilder *bld) { struct hdfsBuilderConfOpt *cur, *next; cur = bld->opts; for (cur = bld->opts; cur; ) { next = cur->next; free(cur); cur = next; } free(bld); } void hdfsBuilderSetForceNewInstance(struct hdfsBuilder *bld) { bld->forceNewInstance = 1; } void hdfsBuilderSetNameNode(struct hdfsBuilder *bld, const char *nn) { bld->nn = nn; } void hdfsBuilderSetNameNodePort(struct hdfsBuilder *bld, tPort port) { bld->port = port; } void hdfsBuilderSetUserName(struct hdfsBuilder *bld, const char *userName) { bld->userName = userName; } void hdfsBuilderSetKerbTicketCachePath(struct hdfsBuilder *bld, const char *kerbTicketCachePath) { bld->kerbTicketCachePath = kerbTicketCachePath; } hdfsFS hdfsConnect(const char *host, tPort port) { struct hdfsBuilder *bld = hdfsNewBuilder(); if (!bld) return NULL; hdfsBuilderSetNameNode(bld, host); hdfsBuilderSetNameNodePort(bld, port); return hdfsBuilderConnect(bld); } /** Always return a new FileSystem handle */ hdfsFS hdfsConnectNewInstance(const char *host, tPort port) { struct hdfsBuilder *bld = hdfsNewBuilder(); if (!bld) return NULL; hdfsBuilderSetNameNode(bld, host); hdfsBuilderSetNameNodePort(bld, port); hdfsBuilderSetForceNewInstance(bld); return hdfsBuilderConnect(bld); } hdfsFS hdfsConnectAsUser(const char *host, tPort port, const char *user) { struct hdfsBuilder *bld = hdfsNewBuilder(); if (!bld) return NULL; hdfsBuilderSetNameNode(bld, host); hdfsBuilderSetNameNodePort(bld, port); hdfsBuilderSetUserName(bld, user); return hdfsBuilderConnect(bld); } /** Always return a new FileSystem handle */ hdfsFS hdfsConnectAsUserNewInstance(const char *host, tPort port, const char *user) { struct hdfsBuilder *bld = hdfsNewBuilder(); if (!bld) return NULL; hdfsBuilderSetNameNode(bld, host); hdfsBuilderSetNameNodePort(bld, port); hdfsBuilderSetForceNewInstance(bld); hdfsBuilderSetUserName(bld, user); return hdfsBuilderConnect(bld); } /** * Calculate the effective URI to use, given a builder configuration. * * If there is not already a URI scheme, we prepend 'hdfs://'. * * If there is not already a port specified, and a port was given to the * builder, we suffix that port. If there is a port specified but also one in * the URI, that is an error. * * @param bld The hdfs builder object * @param uri (out param) dynamically allocated string representing the * effective URI * * @return 0 on success; error code otherwise */ static int calcEffectiveURI(struct hdfsBuilder *bld, char ** uri) { const char *scheme; char suffix[64]; const char *lastColon; char *u; size_t uriLen; if (!bld->nn) return EINVAL; scheme = (strstr(bld->nn, "://")) ? 
"" : "hdfs://"; if (bld->port == 0) { suffix[0] = '\0'; } else { lastColon = strrchr(bld->nn, ':'); if (lastColon && (strspn(lastColon + 1, "0123456789") == strlen(lastColon + 1))) { fprintf(stderr, "port %d was given, but URI '%s' already " "contains a port!\n", bld->port, bld->nn); return EINVAL; } snprintf(suffix, sizeof(suffix), ":%d", bld->port); } uriLen = strlen(scheme) + strlen(bld->nn) + strlen(suffix); u = malloc((uriLen + 1) * (sizeof(char))); if (!u) { fprintf(stderr, "calcEffectiveURI: out of memory"); return ENOMEM; } snprintf(u, uriLen + 1, "%s%s%s", scheme, bld->nn, suffix); *uri = u; return 0; } static const char *maybeNull(const char *str) { return str ? str : "(NULL)"; } static const char *hdfsBuilderToStr(const struct hdfsBuilder *bld, char *buf, size_t bufLen) { snprintf(buf, bufLen, "forceNewInstance=%d, nn=%s, port=%d, " "kerbTicketCachePath=%s, userName=%s", bld->forceNewInstance, maybeNull(bld->nn), bld->port, maybeNull(bld->kerbTicketCachePath), maybeNull(bld->userName)); return buf; } hdfsFS hdfsBuilderConnect(struct hdfsBuilder *bld) { JNIEnv *env = 0; jobject jConfiguration = NULL, jFS = NULL, jURI = NULL, jCachePath = NULL; jstring jURIString = NULL, jUserString = NULL; jvalue jVal; jthrowable jthr = NULL; char *cURI = 0, buf[512]; int ret; jobject jRet = NULL; struct hdfsBuilderConfOpt *opt; //Get the JNIEnv* corresponding to current thread env = getJNIEnv(); if (env == NULL) { ret = EINTERNAL; goto done; } // jConfiguration = new Configuration(); jthr = constructNewObjectOfClass(env, &jConfiguration, HADOOP_CONF, "()V"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsBuilderConnect(%s)", hdfsBuilderToStr(bld, buf, sizeof(buf))); goto done; } // set configuration values for (opt = bld->opts; opt; opt = opt->next) { jthr = hadoopConfSetStr(env, jConfiguration, opt->key, opt->val); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsBuilderConnect(%s): error setting conf '%s' to '%s'", hdfsBuilderToStr(bld, buf, sizeof(buf)), opt->key, opt->val); goto done; } } //Check what type of FileSystem the caller wants... if (bld->nn == NULL) { // Get a local filesystem. 
if (bld->forceNewInstance) { // fs = FileSytem#newInstanceLocal(conf); jthr = invokeMethod(env, &jVal, STATIC, NULL, HADOOP_FS, "newInstanceLocal", JMETHOD1(JPARAM(HADOOP_CONF), JPARAM(HADOOP_LOCALFS)), jConfiguration); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsBuilderConnect(%s)", hdfsBuilderToStr(bld, buf, sizeof(buf))); goto done; } jFS = jVal.l; } else { // fs = FileSytem#getLocal(conf); jthr = invokeMethod(env, &jVal, STATIC, NULL, HADOOP_FS, "getLocal", JMETHOD1(JPARAM(HADOOP_CONF), JPARAM(HADOOP_LOCALFS)), jConfiguration); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsBuilderConnect(%s)", hdfsBuilderToStr(bld, buf, sizeof(buf))); goto done; } jFS = jVal.l; } } else { if (!strcmp(bld->nn, "default")) { // jURI = FileSystem.getDefaultUri(conf) jthr = invokeMethod(env, &jVal, STATIC, NULL, HADOOP_FS, "getDefaultUri", "(Lorg/apache/hadoop/conf/Configuration;)Ljava/net/URI;", jConfiguration); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsBuilderConnect(%s)", hdfsBuilderToStr(bld, buf, sizeof(buf))); goto done; } jURI = jVal.l; } else { // fs = FileSystem#get(URI, conf, ugi); ret = calcEffectiveURI(bld, &cURI); if (ret) goto done; jthr = newJavaStr(env, cURI, &jURIString); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsBuilderConnect(%s)", hdfsBuilderToStr(bld, buf, sizeof(buf))); goto done; } jthr = invokeMethod(env, &jVal, STATIC, NULL, JAVA_NET_URI, "create", "(Ljava/lang/String;)Ljava/net/URI;", jURIString); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsBuilderConnect(%s)", hdfsBuilderToStr(bld, buf, sizeof(buf))); goto done; } jURI = jVal.l; } if (bld->kerbTicketCachePath) { jthr = hadoopConfSetStr(env, jConfiguration, KERBEROS_TICKET_CACHE_PATH, bld->kerbTicketCachePath); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsBuilderConnect(%s)", hdfsBuilderToStr(bld, buf, sizeof(buf))); goto done; } } jthr = newJavaStr(env, bld->userName, &jUserString); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsBuilderConnect(%s)", hdfsBuilderToStr(bld, buf, sizeof(buf))); goto done; } if (bld->forceNewInstance) { jthr = invokeMethod(env, &jVal, STATIC, NULL, HADOOP_FS, "newInstance", JMETHOD3(JPARAM(JAVA_NET_URI), JPARAM(HADOOP_CONF), JPARAM(JAVA_STRING), JPARAM(HADOOP_FS)), jURI, jConfiguration, jUserString); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsBuilderConnect(%s)", hdfsBuilderToStr(bld, buf, sizeof(buf))); goto done; } jFS = jVal.l; } else { jthr = invokeMethod(env, &jVal, STATIC, NULL, HADOOP_FS, "get", JMETHOD3(JPARAM(JAVA_NET_URI), JPARAM(HADOOP_CONF), JPARAM(JAVA_STRING), JPARAM(HADOOP_FS)), jURI, jConfiguration, jUserString); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsBuilderConnect(%s)", hdfsBuilderToStr(bld, buf, sizeof(buf))); goto done; } jFS = jVal.l; } } jRet = (*env)->NewGlobalRef(env, jFS); if (!jRet) { ret = printPendingExceptionAndFree(env, PRINT_EXC_ALL, "hdfsBuilderConnect(%s)", hdfsBuilderToStr(bld, buf, sizeof(buf))); goto done; } ret = 0; done: // Release unnecessary local references destroyLocalReference(env, jConfiguration); destroyLocalReference(env, jFS); destroyLocalReference(env, jURI); destroyLocalReference(env, jCachePath); destroyLocalReference(env, jURIString); destroyLocalReference(env, jUserString); free(cURI); hdfsFreeBuilder(bld); if (ret) { errno = ret; return NULL; } return (hdfsFS)jRet; } int hdfsDisconnect(hdfsFS fs) { // JAVA 
EQUIVALENT: // fs.close() //Get the JNIEnv* corresponding to current thread JNIEnv* env = getJNIEnv(); int ret; jobject jFS; jthrowable jthr; if (env == NULL) { errno = EINTERNAL; return -1; } //Parameters jFS = (jobject)fs; //Sanity check if (fs == NULL) { errno = EBADF; return -1; } jthr = invokeMethod(env, NULL, INSTANCE, jFS, HADOOP_FS, "close", "()V"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsDisconnect: FileSystem#close"); } else { ret = 0; } (*env)->DeleteGlobalRef(env, jFS); if (ret) { errno = ret; return -1; } return 0; } /** * Get the default block size of a FileSystem object. * * @param env The Java env * @param jFS The FileSystem object * @param jPath The path to find the default blocksize at * @param out (out param) the default block size * * @return NULL on success; or the exception */ static jthrowable getDefaultBlockSize(JNIEnv *env, jobject jFS, jobject jPath, jlong *out) { jthrowable jthr; jvalue jVal; jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, "getDefaultBlockSize", JMETHOD1(JPARAM(HADOOP_PATH), "J"), jPath); if (jthr) return jthr; *out = jVal.j; return NULL; } hdfsFile hdfsOpenFile(hdfsFS fs, const char *path, int flags, int bufferSize, short replication, tSize blockSize) { /* JAVA EQUIVALENT: File f = new File(path); FSData{Input|Output}Stream f{is|os} = fs.create(f); return f{is|os}; */ int accmode = flags & O_ACCMODE; jstring jStrBufferSize = NULL, jStrReplication = NULL; jobject jConfiguration = NULL, jPath = NULL, jFile = NULL; jobject jFS = (jobject)fs; jthrowable jthr; jvalue jVal; hdfsFile file = NULL; int ret; jint jBufferSize = bufferSize; jshort jReplication = replication; /* The hadoop java api/signature */ const char *method = NULL; const char *signature = NULL; /* Get the JNIEnv* corresponding to current thread */ JNIEnv* env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; return NULL; } if (accmode == O_RDONLY || accmode == O_WRONLY) { /* yay */ } else if (accmode == O_RDWR) { fprintf(stderr, "ERROR: cannot open an hdfs file in O_RDWR mode\n"); errno = ENOTSUP; return NULL; } else { fprintf(stderr, "ERROR: cannot open an hdfs file in mode 0x%x\n", accmode); errno = EINVAL; return NULL; } if ((flags & O_CREAT) && (flags & O_EXCL)) { fprintf(stderr, "WARN: hdfs does not truly support O_CREATE && O_EXCL\n"); } if (accmode == O_RDONLY) { method = "open"; signature = JMETHOD2(JPARAM(HADOOP_PATH), "I", JPARAM(HADOOP_ISTRM)); } else if (flags & O_APPEND) { method = "append"; signature = JMETHOD1(JPARAM(HADOOP_PATH), JPARAM(HADOOP_OSTRM)); } else { method = "create"; signature = JMETHOD2(JPARAM(HADOOP_PATH), "ZISJ", JPARAM(HADOOP_OSTRM)); } /* Create an object of org.apache.hadoop.fs.Path */ jthr = constructNewObjectOfPath(env, path, &jPath); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsOpenFile(%s): constructNewObjectOfPath", path); goto done; } /* Get the Configuration object from the FileSystem object */ jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, "getConf", JMETHOD1("", JPARAM(HADOOP_CONF))); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsOpenFile(%s): FileSystem#getConf", path); goto done; } jConfiguration = jVal.l; jStrBufferSize = (*env)->NewStringUTF(env, "io.file.buffer.size"); if (!jStrBufferSize) { ret = printPendingExceptionAndFree(env, PRINT_EXC_ALL, "OOM"); goto done; } jStrReplication = (*env)->NewStringUTF(env, "dfs.replication"); if (!jStrReplication) { ret = printPendingExceptionAndFree(env, PRINT_EXC_ALL, "OOM"); goto done; } if 
(!bufferSize) { jthr = invokeMethod(env, &jVal, INSTANCE, jConfiguration, HADOOP_CONF, "getInt", "(Ljava/lang/String;I)I", jStrBufferSize, 4096); if (jthr) { ret = printExceptionAndFree(env, jthr, NOPRINT_EXC_FILE_NOT_FOUND | NOPRINT_EXC_ACCESS_CONTROL | NOPRINT_EXC_UNRESOLVED_LINK, "hdfsOpenFile(%s): Configuration#getInt(io.file.buffer.size)", path); goto done; } jBufferSize = jVal.i; } if ((accmode == O_WRONLY) && (flags & O_APPEND) == 0) { if (!replication) { jthr = invokeMethod(env, &jVal, INSTANCE, jConfiguration, HADOOP_CONF, "getInt", "(Ljava/lang/String;I)I", jStrReplication, 1); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsOpenFile(%s): Configuration#getInt(dfs.replication)", path); goto done; } jReplication = (jshort)jVal.i; } } /* Create and return either the FSDataInputStream or FSDataOutputStream references jobject jStream */ // READ? if (accmode == O_RDONLY) { jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, method, signature, jPath, jBufferSize); } else if ((accmode == O_WRONLY) && (flags & O_APPEND)) { // WRITE/APPEND? jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, method, signature, jPath); } else { // WRITE/CREATE jboolean jOverWrite = 1; jlong jBlockSize = blockSize; if (jBlockSize == 0) { jthr = getDefaultBlockSize(env, jFS, jPath, &jBlockSize); if (jthr) { ret = EIO; goto done; } } jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, method, signature, jPath, jOverWrite, jBufferSize, jReplication, jBlockSize); } if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsOpenFile(%s): FileSystem#%s(%s)", path, method, signature); goto done; } jFile = jVal.l; file = calloc(1, sizeof(struct hdfsFile_internal)); if (!file) { fprintf(stderr, "hdfsOpenFile(%s): OOM create hdfsFile\n", path); ret = ENOMEM; goto done; } file->file = (*env)->NewGlobalRef(env, jFile); if (!file->file) { ret = printPendingExceptionAndFree(env, PRINT_EXC_ALL, "hdfsOpenFile(%s): NewGlobalRef", path); goto done; } file->type = (((flags & O_WRONLY) == 0) ? 
HDFS_STREAM_INPUT : HDFS_STREAM_OUTPUT); file->flags = 0; if ((flags & O_WRONLY) == 0) { file->flags |= HDFS_FILE_SUPPORTS_DIRECT_READ; } ret = 0; done: destroyLocalReference(env, jStrBufferSize); destroyLocalReference(env, jStrReplication); destroyLocalReference(env, jConfiguration); destroyLocalReference(env, jPath); destroyLocalReference(env, jFile); if (ret) { if (file) { if (file->file) { (*env)->DeleteGlobalRef(env, file->file); } free(file); } errno = ret; return NULL; } return file; } int hdfsTruncateFile(hdfsFS fs, const char* path, tOffset newlength) { jobject jFS = (jobject)fs; jthrowable jthr; jvalue jVal; jobject jPath = NULL; JNIEnv *env = getJNIEnv(); if (!env) { errno = EINTERNAL; return -1; } /* Create an object of org.apache.hadoop.fs.Path */ jthr = constructNewObjectOfPath(env, path, &jPath); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsTruncateFile(%s): constructNewObjectOfPath", path); return -1; } jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, "truncate", JMETHOD2(JPARAM(HADOOP_PATH), "J", "Z"), jPath, newlength); destroyLocalReference(env, jPath); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsTruncateFile(%s): FileSystem#truncate", path); return -1; } if (jVal.z == JNI_TRUE) { return 1; } return 0; } int hdfsUnbufferFile(hdfsFile file) { int ret; jthrowable jthr; JNIEnv *env = getJNIEnv(); if (!env) { ret = EINTERNAL; goto done; } if (file->type != HDFS_STREAM_INPUT) { ret = ENOTSUP; goto done; } jthr = invokeMethod(env, NULL, INSTANCE, file->file, HADOOP_ISTRM, "unbuffer", "()V"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, HADOOP_ISTRM "#unbuffer failed:"); goto done; } ret = 0; done: errno = ret; return ret; } int hdfsCloseFile(hdfsFS fs, hdfsFile file) { int ret; // JAVA EQUIVALENT: // file.close //The interface whose 'close' method to be called const char *interface; const char *interfaceShortName; //Caught exception jthrowable jthr; //Get the JNIEnv* corresponding to current thread JNIEnv* env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; return -1; } //Sanity check if (!file || file->type == HDFS_STREAM_UNINITIALIZED) { errno = EBADF; return -1; } interface = (file->type == HDFS_STREAM_INPUT) ? HADOOP_ISTRM : HADOOP_OSTRM; jthr = invokeMethod(env, NULL, INSTANCE, file->file, interface, "close", "()V"); if (jthr) { interfaceShortName = (file->type == HDFS_STREAM_INPUT) ? "FSDataInputStream" : "FSDataOutputStream"; ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "%s#close", interfaceShortName); } else { ret = 0; } //De-allocate memory (*env)->DeleteGlobalRef(env, file->file); free(file); if (ret) { errno = ret; return -1; } return 0; } int hdfsExists(hdfsFS fs, const char *path) { JNIEnv *env = getJNIEnv(); jobject jPath; jvalue jVal; jobject jFS = (jobject)fs; jthrowable jthr; if (env == NULL) { errno = EINTERNAL; return -1; } if (path == NULL) { errno = EINVAL; return -1; } jthr = constructNewObjectOfPath(env, path, &jPath); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsExists: constructNewObjectOfPath"); return -1; } jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, "exists", JMETHOD1(JPARAM(HADOOP_PATH), "Z"), jPath); destroyLocalReference(env, jPath); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsExists: invokeMethod(%s)", JMETHOD1(JPARAM(HADOOP_PATH), "Z")); return -1; } if (jVal.z) { return 0; } else { errno = ENOENT; return -1; } } // Checks input file for readiness for reading. 
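/*
 * Read-loop sketch (a hedged example, not part of this file; consume() is a
 * hypothetical callback): hdfsRead() below returns the number of bytes read,
 * 0 at EOF, and -1 with errno set on error.  A zero-byte short read from the
 * underlying Java stream surfaces as errno == EINTR and can be retried.
 *
 *   char buf[4096];
 *   tSize n;
 *   while ((n = hdfsRead(fs, f, buf, sizeof(buf))) != 0) {
 *       if (n < 0) {
 *           if (errno == EINTR)
 *               continue;   // transient zero-byte read: retry
 *           break;          // real error: errno already set
 *       }
 *       consume(buf, n);
 *   }
 */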
static int readPrepare(JNIEnv* env, hdfsFS fs, hdfsFile f, jobject* jInputStream) { *jInputStream = (jobject)(f ? f->file : NULL); //Sanity check if (!f || f->type == HDFS_STREAM_UNINITIALIZED) { errno = EBADF; return -1; } //Error checking... make sure that this file is 'readable' if (f->type != HDFS_STREAM_INPUT) { fprintf(stderr, "Cannot read from a non-InputStream object!\n"); errno = EINVAL; return -1; } return 0; } tSize hdfsRead(hdfsFS fs, hdfsFile f, void* buffer, tSize length) { jobject jInputStream; jbyteArray jbRarray; jvalue jVal; jthrowable jthr; JNIEnv* env; tSize ret; if (length == 0) { return 0; } else if (length < 0) { errno = EINVAL; return -1; } if (f->flags & HDFS_FILE_SUPPORTS_DIRECT_READ) { if ((ret = readDirect(fs, f, buffer, length)) < 0) { if (errno != ENOTSUP) { return -1; } hdfsFileDisableDirectRead(f); } else { return ret; } } // JAVA EQUIVALENT: // byte [] bR = new byte[length]; // fis.read(bR); //Get the JNIEnv* corresponding to current thread env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; return -1; } //Parameters if (readPrepare(env, fs, f, &jInputStream) == -1) { return -1; } //Read the requisite bytes jbRarray = (*env)->NewByteArray(env, length); if (!jbRarray) { errno = printPendingExceptionAndFree(env, PRINT_EXC_ALL, "hdfsRead: NewByteArray"); return -1; } jthr = invokeMethod(env, &jVal, INSTANCE, jInputStream, HADOOP_ISTRM, "read", "([B)I", jbRarray); if (jthr) { destroyLocalReference(env, jbRarray); errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsRead: FSDataInputStream#read"); return -1; } if (jVal.i < 0) { // EOF destroyLocalReference(env, jbRarray); return 0; } else if (jVal.i == 0) { destroyLocalReference(env, jbRarray); errno = EINTR; return -1; } //Copy out only the jVal.i bytes the stream actually read (*env)->GetByteArrayRegion(env, jbRarray, 0, jVal.i, buffer); destroyLocalReference(env, jbRarray); if ((*env)->ExceptionCheck(env)) { errno = printPendingExceptionAndFree(env, PRINT_EXC_ALL, "hdfsRead: GetByteArrayRegion"); return -1; } return jVal.i; } // Reads using the read(ByteBuffer) API, which does fewer copies tSize readDirect(hdfsFS fs, hdfsFile f, void* buffer, tSize length) { // JAVA EQUIVALENT: // ByteBuffer bbuffer = ByteBuffer.allocateDirect(length) // wraps C buffer // fis.read(bbuffer); jobject jInputStream; jvalue jVal; jthrowable jthr; jobject bb; //Get the JNIEnv* corresponding to current thread JNIEnv* env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; return -1; } if (readPrepare(env, fs, f, &jInputStream) == -1) { return -1; } //Read the requisite bytes bb = (*env)->NewDirectByteBuffer(env, buffer, length); if (bb == NULL) { errno = printPendingExceptionAndFree(env, PRINT_EXC_ALL, "readDirect: NewDirectByteBuffer"); return -1; } jthr = invokeMethod(env, &jVal, INSTANCE, jInputStream, HADOOP_ISTRM, "read", "(Ljava/nio/ByteBuffer;)I", bb); destroyLocalReference(env, bb); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "readDirect: FSDataInputStream#read"); return -1; } return (jVal.i < 0) ? 0 : jVal.i; }
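/*
 * Example: a minimal read loop over the semantics implemented above (an
 * illustrative sketch, not part of libhdfs; "fs" and "file" stand for an
 * already-connected filesystem and an O_RDONLY handle, and consume() is a
 * hypothetical callback).  hdfsRead may return fewer bytes than requested,
 * returns 0 at end of file, and returns -1 with errno == EINTR when data is
 * temporarily unavailable, so callers should retry rather than treat EINTR
 * as fatal:
 *
 *     char buf[4096];
 *     for (;;) {
 *         tSize n = hdfsRead(fs, file, buf, sizeof(buf));
 *         if (n > 0) {
 *             consume(buf, n);   // process exactly n bytes
 *         } else if (n == 0) {
 *             break;             // EOF
 *         } else if (errno == EINTR) {
 *             continue;          // transient condition; retry the read
 *         } else {
 *             break;             // real error; errno holds the code
 *         }
 *     }
 */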
tSize hdfsPread(hdfsFS fs, hdfsFile f, tOffset position, void* buffer, tSize length) { JNIEnv* env; jbyteArray jbRarray; jvalue jVal; jthrowable jthr; if (length == 0) { return 0; } else if (length < 0) { errno = EINVAL; return -1; } if (!f || f->type == HDFS_STREAM_UNINITIALIZED) { errno = EBADF; return -1; } env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; return -1; } //Error checking... make sure that this file is 'readable' if (f->type != HDFS_STREAM_INPUT) { fprintf(stderr, "Cannot read from a non-InputStream object!\n"); errno = EINVAL; return -1; } // JAVA EQUIVALENT: // byte [] bR = new byte[length]; // fis.read(pos, bR, 0, length); jbRarray = (*env)->NewByteArray(env, length); if (!jbRarray) { errno = printPendingExceptionAndFree(env, PRINT_EXC_ALL, "hdfsPread: NewByteArray"); return -1; } jthr = invokeMethod(env, &jVal, INSTANCE, f->file, HADOOP_ISTRM, "read", "(J[BII)I", position, jbRarray, 0, length); if (jthr) { destroyLocalReference(env, jbRarray); errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsPread: FSDataInputStream#read"); return -1; } if (jVal.i < 0) { // EOF destroyLocalReference(env, jbRarray); return 0; } else if (jVal.i == 0) { destroyLocalReference(env, jbRarray); errno = EINTR; return -1; } (*env)->GetByteArrayRegion(env, jbRarray, 0, jVal.i, buffer); destroyLocalReference(env, jbRarray); if ((*env)->ExceptionCheck(env)) { errno = printPendingExceptionAndFree(env, PRINT_EXC_ALL, "hdfsPread: GetByteArrayRegion"); return -1; } return jVal.i; } tSize hdfsWrite(hdfsFS fs, hdfsFile f, const void* buffer, tSize length) { // JAVA EQUIVALENT // byte b[] = str.getBytes(); // fso.write(b); jobject jOutputStream; jbyteArray jbWarray; jthrowable jthr; //Get the JNIEnv* corresponding to current thread JNIEnv* env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; return -1; } //Sanity check if (!f || f->type == HDFS_STREAM_UNINITIALIZED) { errno = EBADF; return -1; } jOutputStream = f->file; //Error checking... make sure that this file is 'writable' if (f->type != HDFS_STREAM_OUTPUT) { fprintf(stderr, "Cannot write into a non-OutputStream object!\n"); errno = EINVAL; return -1; } if (length < 0) { errno = EINVAL; return -1; } if (length == 0) { return 0; } //Write the requisite bytes into the file jbWarray = (*env)->NewByteArray(env, length); if (!jbWarray) { errno = printPendingExceptionAndFree(env, PRINT_EXC_ALL, "hdfsWrite: NewByteArray"); return -1; } (*env)->SetByteArrayRegion(env, jbWarray, 0, length, buffer); if ((*env)->ExceptionCheck(env)) { destroyLocalReference(env, jbWarray); errno = printPendingExceptionAndFree(env, PRINT_EXC_ALL, "hdfsWrite(length = %d): SetByteArrayRegion", length); return -1; } jthr = invokeMethod(env, NULL, INSTANCE, jOutputStream, HADOOP_OSTRM, "write", "([B)V", jbWarray); destroyLocalReference(env, jbWarray); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsWrite: FSDataOutputStream#write"); return -1; } // Unlike most Java streams, FSDataOutputStream never does partial writes. // If we succeeded, all the data was written. return length; }
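/*
 * Example: a write path relying on the guarantee noted above (an
 * illustrative sketch, not part of libhdfs; "fs" and "out" stand for a
 * connected filesystem and an O_WRONLY handle).  Since hdfsWrite never does
 * a partial write, a single call either writes all of "len" bytes or fails,
 * so no write loop is needed; hdfsHFlush (defined below) then makes the
 * data visible to new readers:
 *
 *     if (hdfsWrite(fs, out, data, len) == -1) {
 *         perror("hdfsWrite");    // errno was set by the wrapper
 *     } else if (hdfsHFlush(fs, out) == -1) {
 *         perror("hdfsHFlush");   // written, but possibly not yet visible
 *     }
 */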
int hdfsSeek(hdfsFS fs, hdfsFile f, tOffset desiredPos) { // JAVA EQUIVALENT // fis.seek(pos); jobject jInputStream; jthrowable jthr; //Get the JNIEnv* corresponding to current thread JNIEnv* env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; return -1; } //Sanity check if (!f || f->type != HDFS_STREAM_INPUT) { errno = EBADF; return -1; } jInputStream = f->file; jthr = invokeMethod(env, NULL, INSTANCE, jInputStream, HADOOP_ISTRM, "seek", "(J)V", desiredPos); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsSeek(desiredPos=%" PRId64 ")" ": FSDataInputStream#seek", desiredPos); return -1; } return 0; } tOffset hdfsTell(hdfsFS fs, hdfsFile f) { // JAVA EQUIVALENT // pos = f.getPos(); jobject jStream; const char *interface; jvalue jVal; jthrowable jthr; //Get the JNIEnv* corresponding to current thread JNIEnv* env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; return -1; } //Sanity check if (!f || f->type == HDFS_STREAM_UNINITIALIZED) { errno = EBADF; return -1; } //Parameters jStream = f->file; interface = (f->type == HDFS_STREAM_INPUT) ? HADOOP_ISTRM : HADOOP_OSTRM; jthr = invokeMethod(env, &jVal, INSTANCE, jStream, interface, "getPos", "()J"); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsTell: %s#getPos", ((f->type == HDFS_STREAM_INPUT) ? "FSDataInputStream" : "FSDataOutputStream")); return -1; } return jVal.j; } int hdfsFlush(hdfsFS fs, hdfsFile f) { // JAVA EQUIVALENT // fos.flush(); jthrowable jthr; //Get the JNIEnv* corresponding to current thread JNIEnv* env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; return -1; } //Sanity check if (!f || f->type != HDFS_STREAM_OUTPUT) { errno = EBADF; return -1; } jthr = invokeMethod(env, NULL, INSTANCE, f->file, HADOOP_OSTRM, "flush", "()V"); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsFlush: FSDataOutputStream#flush"); return -1; } return 0; } int hdfsHFlush(hdfsFS fs, hdfsFile f) { jobject jOutputStream; jthrowable jthr; //Get the JNIEnv* corresponding to current thread JNIEnv* env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; return -1; } //Sanity check if (!f || f->type != HDFS_STREAM_OUTPUT) { errno = EBADF; return -1; } jOutputStream = f->file; jthr = invokeMethod(env, NULL, INSTANCE, jOutputStream, HADOOP_OSTRM, "hflush", "()V"); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsHFlush: FSDataOutputStream#hflush"); return -1; } return 0; } int hdfsHSync(hdfsFS fs, hdfsFile f) { jobject jOutputStream; jthrowable jthr; //Get the JNIEnv* corresponding to current thread JNIEnv* env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; return -1; } //Sanity check if (!f || f->type != HDFS_STREAM_OUTPUT) { errno = EBADF; return -1; } jOutputStream = f->file; jthr = invokeMethod(env, NULL, INSTANCE, jOutputStream, HADOOP_OSTRM, "hsync", "()V"); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsHSync: FSDataOutputStream#hsync"); return -1; } return 0; } int hdfsAvailable(hdfsFS fs, hdfsFile f) { // JAVA EQUIVALENT // fis.available(); jobject jInputStream; jvalue jVal; jthrowable jthr; //Get the JNIEnv* corresponding to current thread JNIEnv* env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; return -1; } //Sanity check if (!f || f->type != HDFS_STREAM_INPUT) { errno = EBADF; return -1; } //Parameters jInputStream = f->file; jthr = invokeMethod(env, &jVal, INSTANCE, jInputStream, HADOOP_ISTRM, "available", "()I"); if (jthr) { errno = printExceptionAndFree(env, jthr,
PRINT_EXC_ALL, "hdfsAvailable: FSDataInputStream#available"); return -1; } return jVal.i; } static int hdfsCopyImpl(hdfsFS srcFS, const char *src, hdfsFS dstFS, const char *dst, jboolean deleteSource) { //JAVA EQUIVALENT // FileUtil#copy(srcFS, srcPath, dstFS, dstPath, // deleteSource = false, conf) //Parameters jobject jSrcFS = (jobject)srcFS; jobject jDstFS = (jobject)dstFS; jobject jConfiguration = NULL, jSrcPath = NULL, jDstPath = NULL; jthrowable jthr; jvalue jVal; int ret; //Get the JNIEnv* corresponding to current thread JNIEnv* env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; return -1; } jthr = constructNewObjectOfPath(env, src, &jSrcPath); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsCopyImpl(src=%s): constructNewObjectOfPath", src); goto done; } jthr = constructNewObjectOfPath(env, dst, &jDstPath); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsCopyImpl(dst=%s): constructNewObjectOfPath", dst); goto done; } //Create the org.apache.hadoop.conf.Configuration object jthr = constructNewObjectOfClass(env, &jConfiguration, HADOOP_CONF, "()V"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsCopyImpl: Configuration constructor"); goto done; } //FileUtil#copy jthr = invokeMethod(env, &jVal, STATIC, NULL, "org/apache/hadoop/fs/FileUtil", "copy", "(Lorg/apache/hadoop/fs/FileSystem;Lorg/apache/hadoop/fs/Path;" "Lorg/apache/hadoop/fs/FileSystem;Lorg/apache/hadoop/fs/Path;" "ZLorg/apache/hadoop/conf/Configuration;)Z", jSrcFS, jSrcPath, jDstFS, jDstPath, deleteSource, jConfiguration); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsCopyImpl(src=%s, dst=%s, deleteSource=%d): " "FileUtil#copy", src, dst, deleteSource); goto done; } if (!jVal.z) { ret = EIO; goto done; } ret = 0; done: destroyLocalReference(env, jConfiguration); destroyLocalReference(env, jSrcPath); destroyLocalReference(env, jDstPath); if (ret) { errno = ret; return -1; } return 0; } int hdfsCopy(hdfsFS srcFS, const char *src, hdfsFS dstFS, const char *dst) { return hdfsCopyImpl(srcFS, src, dstFS, dst, 0); } int hdfsMove(hdfsFS srcFS, const char *src, hdfsFS dstFS, const char *dst) { return hdfsCopyImpl(srcFS, src, dstFS, dst, 1); } int hdfsDelete(hdfsFS fs, const char *path, int recursive) { // JAVA EQUIVALENT: // Path p = new Path(path); // bool retval = fs.delete(p, recursive); jobject jFS = (jobject)fs; jthrowable jthr; jobject jPath; jvalue jVal; jboolean jRecursive; //Get the JNIEnv* corresponding to current thread JNIEnv* env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; return -1; } jthr = constructNewObjectOfPath(env, path, &jPath); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsDelete(path=%s): constructNewObjectOfPath", path); return -1; } jRecursive = recursive ? 
JNI_TRUE : JNI_FALSE; jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, "delete", "(Lorg/apache/hadoop/fs/Path;Z)Z", jPath, jRecursive); destroyLocalReference(env, jPath); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsDelete(path=%s, recursive=%d): " "FileSystem#delete", path, recursive); return -1; } if (!jVal.z) { errno = EIO; return -1; } return 0; } int hdfsRename(hdfsFS fs, const char *oldPath, const char *newPath) { // JAVA EQUIVALENT: // Path old = new Path(oldPath); // Path new = new Path(newPath); // fs.rename(old, new); jobject jFS = (jobject)fs; jthrowable jthr; jobject jOldPath = NULL, jNewPath = NULL; int ret = -1; jvalue jVal; //Get the JNIEnv* corresponding to current thread JNIEnv* env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; return -1; } jthr = constructNewObjectOfPath(env, oldPath, &jOldPath ); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsRename: constructNewObjectOfPath(%s)", oldPath); goto done; } jthr = constructNewObjectOfPath(env, newPath, &jNewPath); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsRename: constructNewObjectOfPath(%s)", newPath); goto done; } // Rename the file // TODO: use rename2 here? (See HDFS-3592) jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, "rename", JMETHOD2(JPARAM(HADOOP_PATH), JPARAM(HADOOP_PATH), "Z"), jOldPath, jNewPath); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsRename(oldPath=%s, newPath=%s): FileSystem#rename", oldPath, newPath); goto done; } if (!jVal.z) { errno = EIO; goto done; } ret = 0; done: destroyLocalReference(env, jOldPath); destroyLocalReference(env, jNewPath); return ret; } char* hdfsGetWorkingDirectory(hdfsFS fs, char* buffer, size_t bufferSize) { // JAVA EQUIVALENT: // Path p = fs.getWorkingDirectory(); // return p.toString() jobject jPath = NULL; jstring jPathString = NULL; jobject jFS = (jobject)fs; jvalue jVal; jthrowable jthr; int ret; const char *jPathChars = NULL; //Get the JNIEnv* corresponding to current thread JNIEnv* env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; return NULL; } //FileSystem#getWorkingDirectory() jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, "getWorkingDirectory", "()Lorg/apache/hadoop/fs/Path;"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsGetWorkingDirectory: FileSystem#getWorkingDirectory"); goto done; } jPath = jVal.l; if (!jPath) { fprintf(stderr, "hdfsGetWorkingDirectory: " "FileSystem#getWorkingDirectory returned NULL"); ret = -EIO; goto done; } //Path#toString() jthr = invokeMethod(env, &jVal, INSTANCE, jPath, "org/apache/hadoop/fs/Path", "toString", "()Ljava/lang/String;"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsGetWorkingDirectory: Path#toString"); goto done; } jPathString = jVal.l; jPathChars = (*env)->GetStringUTFChars(env, jPathString, NULL); if (!jPathChars) { ret = printPendingExceptionAndFree(env, PRINT_EXC_ALL, "hdfsGetWorkingDirectory: GetStringUTFChars"); goto done; } //Copy to user-provided buffer ret = snprintf(buffer, bufferSize, "%s", jPathChars); if (ret >= bufferSize) { ret = ENAMETOOLONG; goto done; } ret = 0; done: if (jPathChars) { (*env)->ReleaseStringUTFChars(env, jPathString, jPathChars); } destroyLocalReference(env, jPath); destroyLocalReference(env, jPathString); if (ret) { errno = ret; return NULL; } return buffer; } int hdfsSetWorkingDirectory(hdfsFS fs, const char *path) { // JAVA EQUIVALENT: // fs.setWorkingDirectory(Path(path)); jobject 
jFS = (jobject)fs; jthrowable jthr; jobject jPath; //Get the JNIEnv* corresponding to current thread JNIEnv* env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; return -1; } //Create an object of org.apache.hadoop.fs.Path jthr = constructNewObjectOfPath(env, path, &jPath); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsSetWorkingDirectory(%s): constructNewObjectOfPath", path); return -1; } //FileSystem#setWorkingDirectory() jthr = invokeMethod(env, NULL, INSTANCE, jFS, HADOOP_FS, "setWorkingDirectory", "(Lorg/apache/hadoop/fs/Path;)V", jPath); destroyLocalReference(env, jPath); if (jthr) { errno = printExceptionAndFree(env, jthr, NOPRINT_EXC_ILLEGAL_ARGUMENT, "hdfsSetWorkingDirectory(%s): FileSystem#setWorkingDirectory", path); return -1; } return 0; } int hdfsCreateDirectory(hdfsFS fs, const char *path) { // JAVA EQUIVALENT: // fs.mkdirs(new Path(path)); jobject jFS = (jobject)fs; jobject jPath; jthrowable jthr; jvalue jVal; //Get the JNIEnv* corresponding to current thread JNIEnv* env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; return -1; } //Create an object of org.apache.hadoop.fs.Path jthr = constructNewObjectOfPath(env, path, &jPath); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsCreateDirectory(%s): constructNewObjectOfPath", path); return -1; } //Create the directory jVal.z = 0; jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, "mkdirs", "(Lorg/apache/hadoop/fs/Path;)Z", jPath); destroyLocalReference(env, jPath); if (jthr) { errno = printExceptionAndFree(env, jthr, NOPRINT_EXC_ACCESS_CONTROL | NOPRINT_EXC_FILE_NOT_FOUND | NOPRINT_EXC_UNRESOLVED_LINK | NOPRINT_EXC_PARENT_NOT_DIRECTORY, "hdfsCreateDirectory(%s): FileSystem#mkdirs", path); return -1; } if (!jVal.z) { // It's unclear under exactly which conditions FileSystem#mkdirs // is supposed to return false (as opposed to throwing an exception.) // It seems like the current code never actually returns false. // So we're going to translate this to EIO, since there seems to be // nothing more specific we can do with it. errno = EIO; return -1; } return 0; } int hdfsSetReplication(hdfsFS fs, const char *path, int16_t replication) { // JAVA EQUIVALENT: // fs.setReplication(new Path(path), replication); jobject jFS = (jobject)fs; jthrowable jthr; jobject jPath; jvalue jVal; //Get the JNIEnv* corresponding to current thread JNIEnv* env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; return -1; } //Create an object of org.apache.hadoop.fs.Path jthr = constructNewObjectOfPath(env, path, &jPath); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsSetReplication(path=%s): constructNewObjectOfPath", path); return -1; } //Create the directory jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, "setReplication", "(Lorg/apache/hadoop/fs/Path;S)Z", jPath, replication); destroyLocalReference(env, jPath); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsSetReplication(path=%s, replication=%d): " "FileSystem#setReplication", path, replication); return -1; } if (!jVal.z) { // setReplication returns false "if file does not exist or is a // directory." So the nearest translation to that is ENOENT. 
errno = ENOENT; return -1; } return 0; } int hdfsChown(hdfsFS fs, const char *path, const char *owner, const char *group) { // JAVA EQUIVALENT: // fs.setOwner(path, owner, group) jobject jFS = (jobject)fs; jobject jPath = NULL; jstring jOwner = NULL, jGroup = NULL; jthrowable jthr; int ret; //Get the JNIEnv* corresponding to current thread JNIEnv* env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; return -1; } if (owner == NULL && group == NULL) { return 0; } jthr = constructNewObjectOfPath(env, path, &jPath); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsChown(path=%s): constructNewObjectOfPath", path); goto done; } jthr = newJavaStr(env, owner, &jOwner); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsChown(path=%s): newJavaStr(%s)", path, owner); goto done; } jthr = newJavaStr(env, group, &jGroup); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsChown(path=%s): newJavaStr(%s)", path, group); goto done; } //Set the owner and group jthr = invokeMethod(env, NULL, INSTANCE, jFS, HADOOP_FS, "setOwner", JMETHOD3(JPARAM(HADOOP_PATH), JPARAM(JAVA_STRING), JPARAM(JAVA_STRING), JAVA_VOID), jPath, jOwner, jGroup); if (jthr) { ret = printExceptionAndFree(env, jthr, NOPRINT_EXC_ACCESS_CONTROL | NOPRINT_EXC_FILE_NOT_FOUND | NOPRINT_EXC_UNRESOLVED_LINK, "hdfsChown(path=%s, owner=%s, group=%s): " "FileSystem#setOwner", path, owner, group); goto done; } ret = 0; done: destroyLocalReference(env, jPath); destroyLocalReference(env, jOwner); destroyLocalReference(env, jGroup); if (ret) { errno = ret; return -1; } return 0; } int hdfsChmod(hdfsFS fs, const char *path, short mode) { int ret; // JAVA EQUIVALENT: // fs.setPermission(path, FsPermission) jthrowable jthr; jobject jPath = NULL, jPermObj = NULL; jobject jFS = (jobject)fs; jshort jmode = mode; //Get the JNIEnv* corresponding to current thread JNIEnv* env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; return -1; } // construct jPerm = FsPermission.createImmutable(short mode); jthr = constructNewObjectOfClass(env, &jPermObj, HADOOP_FSPERM,"(S)V",jmode); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "constructNewObjectOfClass(%s)", HADOOP_FSPERM); errno = ret; return -1; } //Create an object of org.apache.hadoop.fs.Path jthr = constructNewObjectOfPath(env, path, &jPath); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsChmod(%s): constructNewObjectOfPath", path); goto done; } //Set the permission jthr = invokeMethod(env, NULL, INSTANCE, jFS, HADOOP_FS, "setPermission", JMETHOD2(JPARAM(HADOOP_PATH), JPARAM(HADOOP_FSPERM), JAVA_VOID), jPath, jPermObj); if (jthr) { ret = printExceptionAndFree(env, jthr, NOPRINT_EXC_ACCESS_CONTROL | NOPRINT_EXC_FILE_NOT_FOUND | NOPRINT_EXC_UNRESOLVED_LINK, "hdfsChmod(%s): FileSystem#setPermission", path); goto done; } ret = 0; done: destroyLocalReference(env, jPath); destroyLocalReference(env, jPermObj); if (ret) { errno = ret; return -1; } return 0; } int hdfsUtime(hdfsFS fs, const char *path, tTime mtime, tTime atime) { // JAVA EQUIVALENT: // fs.setTimes(src, mtime, atime) jthrowable jthr; jobject jFS = (jobject)fs; jobject jPath; static const tTime NO_CHANGE = -1; jlong jmtime, jatime; //Get the JNIEnv* corresponding to current thread JNIEnv* env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; return -1; } //Create an object of org.apache.hadoop.fs.Path jthr = constructNewObjectOfPath(env, path, &jPath); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsUtime(path=%s):
constructNewObjectOfPath", path); return -1; } jmtime = (mtime == NO_CHANGE) ? -1 : (mtime * (jlong)1000); jatime = (atime == NO_CHANGE) ? -1 : (atime * (jlong)1000); jthr = invokeMethod(env, NULL, INSTANCE, jFS, HADOOP_FS, "setTimes", JMETHOD3(JPARAM(HADOOP_PATH), "J", "J", JAVA_VOID), jPath, jmtime, jatime); destroyLocalReference(env, jPath); if (jthr) { errno = printExceptionAndFree(env, jthr, NOPRINT_EXC_ACCESS_CONTROL | NOPRINT_EXC_FILE_NOT_FOUND | NOPRINT_EXC_UNRESOLVED_LINK, "hdfsUtime(path=%s): FileSystem#setTimes", path); return -1; } return 0; } /** * Zero-copy options. * * We cache the EnumSet of ReadOptions which has to be passed into every * readZero call, to avoid reconstructing it each time. This cache is cleared * whenever an element changes. */ struct hadoopRzOptions { JNIEnv *env; int skipChecksums; jobject byteBufferPool; jobject cachedEnumSet; }; struct hadoopRzOptions *hadoopRzOptionsAlloc(void) { struct hadoopRzOptions *opts; JNIEnv *env; env = getJNIEnv(); if (!env) { // Check to make sure the JNI environment is set up properly. errno = EINTERNAL; return NULL; } opts = calloc(1, sizeof(struct hadoopRzOptions)); if (!opts) { errno = ENOMEM; return NULL; } return opts; } static void hadoopRzOptionsClearCached(JNIEnv *env, struct hadoopRzOptions *opts) { if (!opts->cachedEnumSet) { return; } (*env)->DeleteGlobalRef(env, opts->cachedEnumSet); opts->cachedEnumSet = NULL; } int hadoopRzOptionsSetSkipChecksum( struct hadoopRzOptions *opts, int skip) { JNIEnv *env; env = getJNIEnv(); if (!env) { errno = EINTERNAL; return -1; } hadoopRzOptionsClearCached(env, opts); opts->skipChecksums = !!skip; return 0; } int hadoopRzOptionsSetByteBufferPool( struct hadoopRzOptions *opts, const char *className) { JNIEnv *env; jthrowable jthr; jobject byteBufferPool = NULL; env = getJNIEnv(); if (!env) { errno = EINTERNAL; return -1; } if (className) { // Note: we don't have to call hadoopRzOptionsClearCached in this // function, since the ByteBufferPool is passed separately from the // EnumSet of ReadOptions. jthr = constructNewObjectOfClass(env, &byteBufferPool, className, "()V"); if (jthr) { printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hadoopRzOptionsSetByteBufferPool(className=%s): ", className); errno = EINVAL; return -1; } } if (opts->byteBufferPool) { // Delete any previous ByteBufferPool we had. (*env)->DeleteGlobalRef(env, opts->byteBufferPool); } opts->byteBufferPool = byteBufferPool; return 0; } void hadoopRzOptionsFree(struct hadoopRzOptions *opts) { JNIEnv *env; env = getJNIEnv(); if (!env) { return; } hadoopRzOptionsClearCached(env, opts); if (opts->byteBufferPool) { (*env)->DeleteGlobalRef(env, opts->byteBufferPool); opts->byteBufferPool = NULL; } free(opts); } struct hadoopRzBuffer { jobject byteBuffer; uint8_t *ptr; int32_t length; int direct; }; static jthrowable hadoopRzOptionsGetEnumSet(JNIEnv *env, struct hadoopRzOptions *opts, jobject *enumSet) { jthrowable jthr = NULL; jobject enumInst = NULL, enumSetObj = NULL; jvalue jVal; if (opts->cachedEnumSet) { // If we cached the value, return it now. 
*enumSet = opts->cachedEnumSet; goto done; } if (opts->skipChecksums) { jthr = fetchEnumInstance(env, READ_OPTION, "SKIP_CHECKSUMS", &enumInst); if (jthr) { goto done; } jthr = invokeMethod(env, &jVal, STATIC, NULL, "java/util/EnumSet", "of", "(Ljava/lang/Enum;)Ljava/util/EnumSet;", enumInst); if (jthr) { goto done; } enumSetObj = jVal.l; } else { jclass clazz = (*env)->FindClass(env, READ_OPTION); if (!clazz) { jthr = newRuntimeError(env, "failed " "to find class for %s", READ_OPTION); goto done; } jthr = invokeMethod(env, &jVal, STATIC, NULL, "java/util/EnumSet", "noneOf", "(Ljava/lang/Class;)Ljava/util/EnumSet;", clazz); if (jthr) { goto done; } enumSetObj = jVal.l; } // create global ref opts->cachedEnumSet = (*env)->NewGlobalRef(env, enumSetObj); if (!opts->cachedEnumSet) { jthr = getPendingExceptionAndClear(env); goto done; } *enumSet = opts->cachedEnumSet; jthr = NULL; done: (*env)->DeleteLocalRef(env, enumInst); (*env)->DeleteLocalRef(env, enumSetObj); return jthr; } static int hadoopReadZeroExtractBuffer(JNIEnv *env, const struct hadoopRzOptions *opts, struct hadoopRzBuffer *buffer) { int ret; jthrowable jthr; jvalue jVal; uint8_t *directStart; void *mallocBuf = NULL; jint position; jarray array = NULL; jthr = invokeMethod(env, &jVal, INSTANCE, buffer->byteBuffer, "java/nio/ByteBuffer", "remaining", "()I"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hadoopReadZeroExtractBuffer: ByteBuffer#remaining failed: "); goto done; } buffer->length = jVal.i; jthr = invokeMethod(env, &jVal, INSTANCE, buffer->byteBuffer, "java/nio/ByteBuffer", "position", "()I"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hadoopReadZeroExtractBuffer: ByteBuffer#position failed: "); goto done; } position = jVal.i; directStart = (*env)->GetDirectBufferAddress(env, buffer->byteBuffer); if (directStart) { // Handle direct buffers. buffer->ptr = directStart + position; buffer->direct = 1; ret = 0; goto done; } // Handle indirect buffers. // The JNI docs don't say that GetDirectBufferAddress throws any exceptions // when it fails. However, they also don't clearly say that it doesn't. It // seems safest to clear any pending exceptions here, to prevent problems on // various JVMs. (*env)->ExceptionClear(env); if (!opts->byteBufferPool) { fputs("hadoopReadZeroExtractBuffer: we read through the " "zero-copy path, but failed to get the address of the buffer via " "GetDirectBufferAddress. Please make sure your JVM supports " "GetDirectBufferAddress.\n", stderr); ret = ENOTSUP; goto done; } // Get the backing array object of this buffer.
jthr = invokeMethod(env, &jVal, INSTANCE, buffer->byteBuffer, "java/nio/ByteBuffer", "array", "()[B"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hadoopReadZeroExtractBuffer: ByteBuffer#array failed: "); goto done; } array = jVal.l; if (!array) { fputs("hadoopReadZeroExtractBuffer: ByteBuffer#array returned NULL.", stderr); ret = EIO; goto done; } mallocBuf = malloc(buffer->length); if (!mallocBuf) { fprintf(stderr, "hadoopReadZeroExtractBuffer: failed to allocate %d bytes of memory\n", buffer->length); ret = ENOMEM; goto done; } (*env)->GetByteArrayRegion(env, array, position, buffer->length, mallocBuf); jthr = (*env)->ExceptionOccurred(env); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hadoopReadZeroExtractBuffer: GetByteArrayRegion failed: "); goto done; } buffer->ptr = mallocBuf; buffer->direct = 0; ret = 0; done: free(mallocBuf); (*env)->DeleteLocalRef(env, array); return ret; } static int translateZCRException(JNIEnv *env, jthrowable exc) { int ret; char *className = NULL; jthrowable jthr = classNameOfObject(exc, env, &className); if (jthr) { fputs("hadoopReadZero: failed to get class name of " "exception from read().\n", stderr); destroyLocalReference(env, exc); destroyLocalReference(env, jthr); ret = EIO; goto done; } if (!strcmp(className, "java.lang.UnsupportedOperationException")) { ret = EPROTONOSUPPORT; goto done; } ret = printExceptionAndFree(env, exc, PRINT_EXC_ALL, "hadoopZeroCopyRead: ZeroCopyCursor#read failed"); done: free(className); return ret; } struct hadoopRzBuffer* hadoopReadZero(hdfsFile file, struct hadoopRzOptions *opts, int32_t maxLength) { JNIEnv *env; jthrowable jthr = NULL; jvalue jVal; jobject enumSet = NULL, byteBuffer = NULL; struct hadoopRzBuffer* buffer = NULL; int ret; env = getJNIEnv(); if (!env) { errno = EINTERNAL; return NULL; } if (file->type != HDFS_STREAM_INPUT) { fputs("Cannot read from a non-InputStream object!\n", stderr); ret = EINVAL; goto done; } buffer = calloc(1, sizeof(struct hadoopRzBuffer)); if (!buffer) { ret = ENOMEM; goto done; } jthr = hadoopRzOptionsGetEnumSet(env, opts, &enumSet); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hadoopReadZero: hadoopRzOptionsGetEnumSet failed: "); goto done; } jthr = invokeMethod(env, &jVal, INSTANCE, file->file, HADOOP_ISTRM, "read", "(Lorg/apache/hadoop/io/ByteBufferPool;ILjava/util/EnumSet;)" "Ljava/nio/ByteBuffer;", opts->byteBufferPool, maxLength, enumSet); if (jthr) { ret = translateZCRException(env, jthr); goto done; } byteBuffer = jVal.l; if (!byteBuffer) { buffer->byteBuffer = NULL; buffer->length = 0; buffer->ptr = NULL; } else { buffer->byteBuffer = (*env)->NewGlobalRef(env, byteBuffer); if (!buffer->byteBuffer) { ret = printPendingExceptionAndFree(env, PRINT_EXC_ALL, "hadoopReadZero: failed to create global ref to ByteBuffer"); goto done; } ret = hadoopReadZeroExtractBuffer(env, opts, buffer); if (ret) { goto done; } } ret = 0; done: (*env)->DeleteLocalRef(env, byteBuffer); if (ret) { if (buffer) { if (buffer->byteBuffer) { (*env)->DeleteGlobalRef(env, buffer->byteBuffer); } free(buffer); } errno = ret; return NULL; } else { errno = 0; } return buffer; } int32_t hadoopRzBufferLength(const struct hadoopRzBuffer *buffer) { return buffer->length; } const void *hadoopRzBufferGet(const struct hadoopRzBuffer *buffer) { return buffer->ptr; } void hadoopRzBufferFree(hdfsFile file, struct hadoopRzBuffer *buffer) { jvalue jVal; jthrowable jthr; JNIEnv* env; env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; return; } if 
(buffer->byteBuffer) { jthr = invokeMethod(env, &jVal, INSTANCE, file->file, HADOOP_ISTRM, "releaseBuffer", "(Ljava/nio/ByteBuffer;)V", buffer->byteBuffer); if (jthr) { printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hadoopRzBufferFree: releaseBuffer failed: "); // even on error, we have to delete the reference. } (*env)->DeleteGlobalRef(env, buffer->byteBuffer); } if (!buffer->direct) { free(buffer->ptr); } memset(buffer, 0, sizeof(*buffer)); free(buffer); }
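/*
 * Example: the full zero-copy read sequence built from the helpers above
 * (an illustrative sketch, not part of libhdfs; "file" stands for an open
 * input stream and error handling is abbreviated).  Supplying a
 * ByteBufferPool is what lets hadoopReadZero fall back to a copied,
 * non-direct buffer when the stream cannot hand out a direct one, per
 * hadoopReadZeroExtractBuffer above:
 *
 *     struct hadoopRzOptions *opts = hadoopRzOptionsAlloc();
 *     hadoopRzOptionsSetSkipChecksum(opts, 1);
 *     hadoopRzOptionsSetByteBufferPool(opts, ELASTIC_BYTE_BUFFER_POOL_CLASS);
 *     struct hadoopRzBuffer *rz = hadoopReadZero(file, opts, 1024 * 1024);
 *     if (rz) {
 *         const void *p = hadoopRzBufferGet(rz);
 *         int32_t n = hadoopRzBufferLength(rz);  // length 0 indicates EOF
 *         // ... consume p[0..n) ...
 *         hadoopRzBufferFree(file, rz);
 *     }
 *     hadoopRzOptionsFree(opts);
 */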
char*** hdfsGetHosts(hdfsFS fs, const char *path, tOffset start, tOffset length) { // JAVA EQUIVALENT: // fs.getFileBlockLocations(new Path(path), start, length); jobject jFS = (jobject)fs; jthrowable jthr; jobject jPath = NULL; jobject jFileStatus = NULL; jvalue jFSVal, jVal; jobjectArray jBlockLocations = NULL, jFileBlockHosts = NULL; jstring jHost = NULL; char*** blockHosts = NULL; int i, j, ret; jsize jNumFileBlocks = 0; jobject jFileBlock; jsize jNumBlockHosts; const char *hostName; //Get the JNIEnv* corresponding to current thread JNIEnv* env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; return NULL; } //Create an object of org.apache.hadoop.fs.Path jthr = constructNewObjectOfPath(env, path, &jPath); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsGetHosts(path=%s): constructNewObjectOfPath", path); goto done; } jthr = invokeMethod(env, &jFSVal, INSTANCE, jFS, HADOOP_FS, "getFileStatus", "(Lorg/apache/hadoop/fs/Path;)" "Lorg/apache/hadoop/fs/FileStatus;", jPath); if (jthr) { ret = printExceptionAndFree(env, jthr, NOPRINT_EXC_FILE_NOT_FOUND, "hdfsGetHosts(path=%s, start=%"PRId64", length=%"PRId64"):" "FileSystem#getFileStatus", path, start, length); destroyLocalReference(env, jPath); goto done; } jFileStatus = jFSVal.l; //org.apache.hadoop.fs.FileSystem#getFileBlockLocations jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, "getFileBlockLocations", "(Lorg/apache/hadoop/fs/FileStatus;JJ)" "[Lorg/apache/hadoop/fs/BlockLocation;", jFileStatus, start, length); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsGetHosts(path=%s, start=%"PRId64", length=%"PRId64"):" "FileSystem#getFileBlockLocations", path, start, length); goto done; } jBlockLocations = jVal.l; //Figure out the number of entries in jBlockLocations //Allocate memory and add NULL at the end jNumFileBlocks = (*env)->GetArrayLength(env, jBlockLocations); blockHosts = calloc(jNumFileBlocks + 1, sizeof(char**)); if (blockHosts == NULL) { ret = ENOMEM; goto done; } if (jNumFileBlocks == 0) { ret = 0; goto done; } //Now parse each block to get hostnames for (i = 0; i < jNumFileBlocks; ++i) { jFileBlock = (*env)->GetObjectArrayElement(env, jBlockLocations, i); if (!jFileBlock) { ret = printPendingExceptionAndFree(env, PRINT_EXC_ALL, "hdfsGetHosts(path=%s, start=%"PRId64", length=%"PRId64"):" "GetObjectArrayElement(%d)", path, start, length, i); goto done; } jthr = invokeMethod(env, &jVal, INSTANCE, jFileBlock, HADOOP_BLK_LOC, "getHosts", "()[Ljava/lang/String;"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsGetHosts(path=%s, start=%"PRId64", length=%"PRId64"):" "BlockLocation#getHosts", path, start, length); goto done; } jFileBlockHosts = jVal.l; if (!jFileBlockHosts) { fprintf(stderr, "hdfsGetHosts(path=%s, start=%"PRId64", length=%"PRId64"):" "BlockLocation#getHosts returned NULL", path, start, length); ret = EINTERNAL; goto done; } //Figure out the number of hosts in jFileBlockHosts, and allocate the memory jNumBlockHosts = (*env)->GetArrayLength(env, jFileBlockHosts); blockHosts[i] = calloc(jNumBlockHosts + 1, sizeof(char*)); if (!blockHosts[i]) { ret = ENOMEM; goto done; } //Now parse each hostname for (j = 0; j < jNumBlockHosts; ++j) { jHost = (*env)->GetObjectArrayElement(env, jFileBlockHosts, j); if (!jHost) { ret = printPendingExceptionAndFree(env, PRINT_EXC_ALL, "hdfsGetHosts(path=%s, start=%"PRId64", length=%"PRId64"): " "GetObjectArrayElement(%d)", path, start, length, j); goto done; } hostName = (const char*)((*env)->GetStringUTFChars(env, jHost, NULL)); if (!hostName) { ret = printPendingExceptionAndFree(env, PRINT_EXC_ALL, "hdfsGetHosts(path=%s, start=%"PRId64", length=%"PRId64", " "j=%d out of %d): GetStringUTFChars", path, start, length, j, jNumBlockHosts); goto done; } blockHosts[i][j] = strdup(hostName); (*env)->ReleaseStringUTFChars(env, jHost, hostName); if (!blockHosts[i][j]) { ret = ENOMEM; goto done; } destroyLocalReference(env, jHost); jHost = NULL; } destroyLocalReference(env, jFileBlockHosts); jFileBlockHosts = NULL; } ret = 0; done: destroyLocalReference(env, jPath); destroyLocalReference(env, jFileStatus); destroyLocalReference(env, jBlockLocations); destroyLocalReference(env, jFileBlockHosts); destroyLocalReference(env, jHost); if (ret) { if (blockHosts) { hdfsFreeHosts(blockHosts); } return NULL; } return blockHosts; } void hdfsFreeHosts(char ***blockHosts) { int i, j; for (i=0; blockHosts[i]; i++) { for (j=0; blockHosts[i][j]; j++) { free(blockHosts[i][j]); } free(blockHosts[i]); } free(blockHosts); } tOffset hdfsGetDefaultBlockSize(hdfsFS fs) { // JAVA EQUIVALENT: // fs.getDefaultBlockSize(); jobject jFS = (jobject)fs; jvalue jVal; jthrowable jthr; //Get the JNIEnv* corresponding to current thread JNIEnv* env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; return -1; } //FileSystem#getDefaultBlockSize() jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, "getDefaultBlockSize", "()J"); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsGetDefaultBlockSize: FileSystem#getDefaultBlockSize"); return -1; } return jVal.j; } tOffset hdfsGetDefaultBlockSizeAtPath(hdfsFS fs, const char *path) { // JAVA EQUIVALENT: // fs.getDefaultBlockSize(path); jthrowable jthr; jobject jFS = (jobject)fs; jobject jPath; tOffset blockSize; JNIEnv* env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; return -1; } jthr = constructNewObjectOfPath(env, path, &jPath); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsGetDefaultBlockSize(path=%s): constructNewObjectOfPath", path); return -1; } jthr = getDefaultBlockSize(env, jFS, jPath, &blockSize); (*env)->DeleteLocalRef(env, jPath); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsGetDefaultBlockSize(path=%s): " "FileSystem#getDefaultBlockSize", path); return -1; } return blockSize; }
FsStatus#getCapacity"); return -1; } return jVal.j; } tOffset hdfsGetUsed(hdfsFS fs) { // JAVA EQUIVALENT: // FsStatus fss = fs.getStatus(); // return Fss.getUsed(); jobject jFS = (jobject)fs; jvalue jVal; jthrowable jthr; jobject fss; //Get the JNIEnv* corresponding to current thread JNIEnv* env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; return -1; } //FileSystem#getStatus jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, "getStatus", "()Lorg/apache/hadoop/fs/FsStatus;"); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsGetUsed: FileSystem#getStatus"); return -1; } fss = (jobject)jVal.l; jthr = invokeMethod(env, &jVal, INSTANCE, fss, HADOOP_FSSTATUS, "getUsed", "()J"); destroyLocalReference(env, fss); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsGetUsed: FsStatus#getUsed"); return -1; } return jVal.j; } /** * We cannot add new fields to the hdfsFileInfo structure because it would break * binary compatibility. The reason is because we return an array * of hdfsFileInfo structures from hdfsListDirectory. So changing the size of * those structures would break all programs that relied on finding the second * element in the array at + sizeof(struct hdfsFileInfo). * * So instead, we add the new fields to the hdfsExtendedFileInfo structure. * This structure is contained in the mOwner string found inside the * hdfsFileInfo. Specifically, the format of mOwner is: * * [owner-string] [null byte] [padding] [hdfsExtendedFileInfo structure] * * The padding is added so that the hdfsExtendedFileInfo structure starts on an * 8-byte boundary. * * @param str The string to locate the extended info in. * @return The offset of the hdfsExtendedFileInfo structure. */ static size_t getExtendedFileInfoOffset(const char *str) { int num_64_bit_words = ((strlen(str) + 1) + 7) / 8; return num_64_bit_words * 8; } static struct hdfsExtendedFileInfo *getExtendedFileInfo(hdfsFileInfo *fileInfo) { char *owner = fileInfo->mOwner; return (struct hdfsExtendedFileInfo *)(owner + getExtendedFileInfoOffset(owner)); } static jthrowable getFileInfoFromStat(JNIEnv *env, jobject jStat, hdfsFileInfo *fileInfo) { jvalue jVal; jthrowable jthr; jobject jPath = NULL; jstring jPathName = NULL; jstring jUserName = NULL; jstring jGroupName = NULL; jobject jPermission = NULL; const char *cPathName; const char *cUserName; const char *cGroupName; struct hdfsExtendedFileInfo *extInfo; size_t extOffset; jthr = invokeMethod(env, &jVal, INSTANCE, jStat, HADOOP_STAT, "isDir", "()Z"); if (jthr) goto done; fileInfo->mKind = jVal.z ? 
static jthrowable getFileInfoFromStat(JNIEnv *env, jobject jStat, hdfsFileInfo *fileInfo) { jvalue jVal; jthrowable jthr; jobject jPath = NULL; jstring jPathName = NULL; jstring jUserName = NULL; jstring jGroupName = NULL; jobject jPermission = NULL; const char *cPathName; const char *cUserName; const char *cGroupName; struct hdfsExtendedFileInfo *extInfo; size_t extOffset; jthr = invokeMethod(env, &jVal, INSTANCE, jStat, HADOOP_STAT, "isDir", "()Z"); if (jthr) goto done; fileInfo->mKind = jVal.z ? kObjectKindDirectory : kObjectKindFile; jthr = invokeMethod(env, &jVal, INSTANCE, jStat, HADOOP_STAT, "getReplication", "()S"); if (jthr) goto done; fileInfo->mReplication = jVal.s; jthr = invokeMethod(env, &jVal, INSTANCE, jStat, HADOOP_STAT, "getBlockSize", "()J"); if (jthr) goto done; fileInfo->mBlockSize = jVal.j; jthr = invokeMethod(env, &jVal, INSTANCE, jStat, HADOOP_STAT, "getModificationTime", "()J"); if (jthr) goto done; fileInfo->mLastMod = jVal.j / 1000; jthr = invokeMethod(env, &jVal, INSTANCE, jStat, HADOOP_STAT, "getAccessTime", "()J"); if (jthr) goto done; fileInfo->mLastAccess = (tTime) (jVal.j / 1000); if (fileInfo->mKind == kObjectKindFile) { jthr = invokeMethod(env, &jVal, INSTANCE, jStat, HADOOP_STAT, "getLen", "()J"); if (jthr) goto done; fileInfo->mSize = jVal.j; } jthr = invokeMethod(env, &jVal, INSTANCE, jStat, HADOOP_STAT, "getPath", "()Lorg/apache/hadoop/fs/Path;"); if (jthr) goto done; jPath = jVal.l; if (jPath == NULL) { jthr = newRuntimeError(env, "org.apache.hadoop.fs.FileStatus#" "getPath returned NULL!"); goto done; } jthr = invokeMethod(env, &jVal, INSTANCE, jPath, HADOOP_PATH, "toString", "()Ljava/lang/String;"); if (jthr) goto done; jPathName = jVal.l; cPathName = (const char*) ((*env)->GetStringUTFChars(env, jPathName, NULL)); if (!cPathName) { jthr = getPendingExceptionAndClear(env); goto done; } fileInfo->mName = strdup(cPathName); (*env)->ReleaseStringUTFChars(env, jPathName, cPathName); jthr = invokeMethod(env, &jVal, INSTANCE, jStat, HADOOP_STAT, "getOwner", "()Ljava/lang/String;"); if (jthr) goto done; jUserName = jVal.l; cUserName = (const char*) ((*env)->GetStringUTFChars(env, jUserName, NULL)); if (!cUserName) { jthr = getPendingExceptionAndClear(env); goto done; } extOffset = getExtendedFileInfoOffset(cUserName); fileInfo->mOwner = malloc(extOffset + sizeof(struct hdfsExtendedFileInfo)); if (!fileInfo->mOwner) { jthr = newRuntimeError(env, "getFileInfo: OOM allocating mOwner"); goto done; } strcpy(fileInfo->mOwner, cUserName); (*env)->ReleaseStringUTFChars(env, jUserName, cUserName); extInfo = getExtendedFileInfo(fileInfo); memset(extInfo, 0, sizeof(*extInfo)); jthr = invokeMethod(env, &jVal, INSTANCE, jStat, HADOOP_STAT, "isEncrypted", "()Z"); if (jthr) { goto done; } if (jVal.z == JNI_TRUE) { extInfo->flags |= HDFS_EXTENDED_FILE_INFO_ENCRYPTED; } jthr = invokeMethod(env, &jVal, INSTANCE, jStat, HADOOP_STAT, "getGroup", "()Ljava/lang/String;"); if (jthr) goto done; jGroupName = jVal.l; cGroupName = (const char*) ((*env)->GetStringUTFChars(env, jGroupName, NULL)); if (!cGroupName) { jthr = getPendingExceptionAndClear(env); goto done; } fileInfo->mGroup = strdup(cGroupName); (*env)->ReleaseStringUTFChars(env, jGroupName, cGroupName); jthr = invokeMethod(env, &jVal, INSTANCE, jStat, HADOOP_STAT, "getPermission", "()Lorg/apache/hadoop/fs/permission/FsPermission;"); if (jthr) goto done; if (jVal.l == NULL) { jthr = newRuntimeError(env, "%s#getPermission returned NULL!", HADOOP_STAT); goto done; } jPermission = jVal.l; jthr = invokeMethod(env, &jVal, INSTANCE, jPermission, HADOOP_FSPERM, "toShort", "()S"); if (jthr) goto done; fileInfo->mPermissions = jVal.s; jthr = NULL; done: if (jthr) hdfsFreeFileInfoEntry(fileInfo); destroyLocalReference(env, jPath); destroyLocalReference(env, jPathName); destroyLocalReference(env, jUserName); destroyLocalReference(env, jGroupName); destroyLocalReference(env, jPermission); return jthr; } static jthrowable getFileInfo(JNIEnv *env, jobject jFS, jobject jPath, hdfsFileInfo
**fileInfo) { // JAVA EQUIVALENT: // fs.isDirectory(f) // fs.getModificationTime() // fs.getAccessTime() // fs.getLength(f) // f.getPath() // f.getOwner() // f.getGroup() // f.getPermission().toShort() jobject jStat; jvalue jVal; jthrowable jthr; jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, "exists", JMETHOD1(JPARAM(HADOOP_PATH), "Z"), jPath); if (jthr) return jthr; if (jVal.z == 0) { *fileInfo = NULL; return NULL; } jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, "getFileStatus", JMETHOD1(JPARAM(HADOOP_PATH), JPARAM(HADOOP_STAT)), jPath); if (jthr) return jthr; jStat = jVal.l; *fileInfo = calloc(1, sizeof(hdfsFileInfo)); if (!*fileInfo) { destroyLocalReference(env, jStat); return newRuntimeError(env, "getFileInfo: OOM allocating hdfsFileInfo"); } jthr = getFileInfoFromStat(env, jStat, *fileInfo); destroyLocalReference(env, jStat); return jthr; } hdfsFileInfo* hdfsListDirectory(hdfsFS fs, const char *path, int *numEntries) { // JAVA EQUIVALENT: // Path p(path); // Path []pathList = fs.listPaths(p) // foreach path in pathList // getFileInfo(path) jobject jFS = (jobject)fs; jthrowable jthr; jobject jPath = NULL; hdfsFileInfo *pathList = NULL; jobjectArray jPathList = NULL; jvalue jVal; jsize jPathListSize = 0; int ret; jsize i; jobject tmpStat; //Get the JNIEnv* corresponding to current thread JNIEnv* env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; return NULL; } //Create an object of org.apache.hadoop.fs.Path jthr = constructNewObjectOfPath(env, path, &jPath); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsListDirectory(%s): constructNewObjectOfPath", path); goto done; } jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_DFS, "listStatus", JMETHOD1(JPARAM(HADOOP_PATH), JARRPARAM(HADOOP_STAT)), jPath); if (jthr) { ret = printExceptionAndFree(env, jthr, NOPRINT_EXC_ACCESS_CONTROL | NOPRINT_EXC_FILE_NOT_FOUND | NOPRINT_EXC_UNRESOLVED_LINK, "hdfsListDirectory(%s): FileSystem#listStatus", path); goto done; } jPathList = jVal.l; //Figure out the number of entries in that directory jPathListSize = (*env)->GetArrayLength(env, jPathList); if (jPathListSize == 0) { ret = 0; goto done; } //Allocate memory pathList = calloc(jPathListSize, sizeof(hdfsFileInfo)); if (pathList == NULL) { ret = ENOMEM; goto done; } //Save path information in pathList for (i=0; i < jPathListSize; ++i) { tmpStat = (*env)->GetObjectArrayElement(env, jPathList, i); if (!tmpStat) { ret = printPendingExceptionAndFree(env, PRINT_EXC_ALL, "hdfsListDirectory(%s): GetObjectArrayElement(%d out of %d)", path, i, jPathListSize); goto done; } jthr = getFileInfoFromStat(env, tmpStat, &pathList[i]); destroyLocalReference(env, tmpStat); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsListDirectory(%s): getFileInfoFromStat(%d out of %d)", path, i, jPathListSize); goto done; } } ret = 0; done: destroyLocalReference(env, jPath); destroyLocalReference(env, jPathList); if (ret) { hdfsFreeFileInfo(pathList, jPathListSize); errno = ret; return NULL; } *numEntries = jPathListSize; errno = 0; return pathList; } hdfsFileInfo *hdfsGetPathInfo(hdfsFS fs, const char *path) { // JAVA EQUIVALENT: // File f(path); // fs.isDirectory(f) // fs.lastModified() ?? 
// fs.getLength(f) // f.getPath() jobject jFS = (jobject)fs; jobject jPath; jthrowable jthr; hdfsFileInfo *fileInfo; //Get the JNIEnv* corresponding to current thread JNIEnv* env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; return NULL; } //Create an object of org.apache.hadoop.fs.Path jthr = constructNewObjectOfPath(env, path, &jPath); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsGetPathInfo(%s): constructNewObjectOfPath", path); return NULL; } jthr = getFileInfo(env, jFS, jPath, &fileInfo); destroyLocalReference(env, jPath); if (jthr) { errno = printExceptionAndFree(env, jthr, NOPRINT_EXC_ACCESS_CONTROL | NOPRINT_EXC_FILE_NOT_FOUND | NOPRINT_EXC_UNRESOLVED_LINK, "hdfsGetPathInfo(%s): getFileInfo", path); return NULL; } if (!fileInfo) { errno = ENOENT; return NULL; } return fileInfo; } static void hdfsFreeFileInfoEntry(hdfsFileInfo *hdfsFileInfo) { free(hdfsFileInfo->mName); free(hdfsFileInfo->mOwner); free(hdfsFileInfo->mGroup); memset(hdfsFileInfo, 0, sizeof(*hdfsFileInfo)); } void hdfsFreeFileInfo(hdfsFileInfo *hdfsFileInfo, int numEntries) { //Free the mName, mOwner, and mGroup int i; for (i=0; i < numEntries; ++i) { hdfsFreeFileInfoEntry(hdfsFileInfo + i); } //Free entire block free(hdfsFileInfo); } int hdfsFileIsEncrypted(hdfsFileInfo *fileInfo) { struct hdfsExtendedFileInfo *extInfo; extInfo = getExtendedFileInfo(fileInfo); return !!(extInfo->flags & HDFS_EXTENDED_FILE_INFO_ENCRYPTED); } /** * vim: ts=4: sw=4: et: */ ================================================ FILE: src/libhdfs/include/hdfs/hdfs.h ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef LIBHDFS_HDFS_H #define LIBHDFS_HDFS_H #include <errno.h> /* for EINTERNAL, etc. */ #include <fcntl.h> /* for O_RDONLY, O_WRONLY */ #include <stdint.h> /* for uint64_t, etc. */ #include <time.h> /* for time_t */ /* * Support export of DLL symbols during libhdfs build, and import of DLL symbols * during client application build. A client application may optionally define * symbol LIBHDFS_DLL_IMPORT in its build. This is not strictly required, but * the compiler can produce more efficient code with it.
*/ #ifdef WIN32 #ifdef LIBHDFS_DLL_EXPORT #define LIBHDFS_EXTERNAL __declspec(dllexport) #elif LIBHDFS_DLL_IMPORT #define LIBHDFS_EXTERNAL __declspec(dllimport) #else #define LIBHDFS_EXTERNAL #endif #else #ifdef LIBHDFS_DLL_EXPORT #define LIBHDFS_EXTERNAL __attribute__((visibility("default"))) #elif LIBHDFS_DLL_IMPORT #define LIBHDFS_EXTERNAL __attribute__((visibility("default"))) #else #define LIBHDFS_EXTERNAL #endif #endif #ifndef O_RDONLY #define O_RDONLY 1 #endif #ifndef O_WRONLY #define O_WRONLY 2 #endif #ifndef EINTERNAL #define EINTERNAL 255 #endif #define ELASTIC_BYTE_BUFFER_POOL_CLASS \ "org/apache/hadoop/io/ElasticByteBufferPool" /** All APIs set errno to meaningful values */ #ifdef __cplusplus extern "C" { #endif /** * Some utility decls used in libhdfs. */ struct hdfsBuilder; typedef int32_t tSize; /// size of data for read/write io ops typedef time_t tTime; /// time type in seconds typedef int64_t tOffset; /// offset within the file typedef uint16_t tPort; /// port typedef enum tObjectKind { kObjectKindFile = 'F', kObjectKindDirectory = 'D', } tObjectKind; /** * The C reflection of org.apache.hadoop.fs.FileSystem. */ struct hdfs_internal; typedef struct hdfs_internal* hdfsFS; struct hdfsFile_internal; typedef struct hdfsFile_internal* hdfsFile; struct hadoopRzOptions; struct hadoopRzBuffer; /** * Determine if a file is open for read. * * @param file The HDFS file * @return 1 if the file is open for read; 0 otherwise */ LIBHDFS_EXTERNAL int hdfsFileIsOpenForRead(hdfsFile file); /** * Determine if a file is open for write. * * @param file The HDFS file * @return 1 if the file is open for write; 0 otherwise */ LIBHDFS_EXTERNAL int hdfsFileIsOpenForWrite(hdfsFile file); struct hdfsReadStatistics { uint64_t totalBytesRead; uint64_t totalLocalBytesRead; uint64_t totalShortCircuitBytesRead; uint64_t totalZeroCopyBytesRead; }; /** * Get read statistics about a file. This is only applicable to files * opened for reading. * * @param file The HDFS file * @param stats (out parameter) on a successful return, the read * statistics. Unchanged otherwise. You must free the * returned statistics with hdfsFileFreeReadStatistics. * @return 0 if the statistics were successfully returned, * -1 otherwise. On a failure, please check errno against * ENOTSUP. webhdfs, LocalFilesystem, and so forth may * not support read statistics. */ LIBHDFS_EXTERNAL int hdfsFileGetReadStatistics(hdfsFile file, struct hdfsReadStatistics **stats); /** * @param stats HDFS read statistics for a file. * * @return the number of remote bytes read. */ LIBHDFS_EXTERNAL int64_t hdfsReadStatisticsGetRemoteBytesRead( const struct hdfsReadStatistics *stats); /** * Clear the read statistics for a file. * * @param file The file to clear the read statistics of. * * @return 0 on success; the error code otherwise. * EINVAL: the file is not open for reading. * ENOTSUP: the file does not support clearing the read * statistics. * Errno will also be set to this code on failure. */ LIBHDFS_EXTERNAL int hdfsFileClearReadStatistics(hdfsFile file); /** * Free some HDFS read statistics. * * @param stats The HDFS read statistics to free. */ LIBHDFS_EXTERNAL void hdfsFileFreeReadStatistics(struct hdfsReadStatistics *stats); /** * hdfsConnectAsUser - Connect to a hdfs file system as a specific user * Connect to the hdfs. * @param nn The NameNode. See hdfsBuilderSetNameNode for details. * @param port The port on which the server is listening. * @param user the user name (this is a hadoop domain user),
or NULL, which is equivalent to hdfsConnect(host, port). * @return Returns a handle to the filesystem or NULL on error. * @deprecated Use hdfsBuilderConnect instead. */ LIBHDFS_EXTERNAL hdfsFS hdfsConnectAsUser(const char* nn, tPort port, const char *user); /** * hdfsConnect - Connect to a hdfs file system. * Connect to the hdfs. * @param nn The NameNode. See hdfsBuilderSetNameNode for details. * @param port The port on which the server is listening. * @return Returns a handle to the filesystem or NULL on error. * @deprecated Use hdfsBuilderConnect instead. */ LIBHDFS_EXTERNAL hdfsFS hdfsConnect(const char* nn, tPort port); /** * hdfsConnectAsUserNewInstance - Connect to an hdfs file system. * * Forces a new instance to be created * * @param nn The NameNode. See hdfsBuilderSetNameNode for details. * @param port The port on which the server is listening. * @param user The user name to use when connecting * @return Returns a handle to the filesystem or NULL on error. * @deprecated Use hdfsBuilderConnect instead. */ LIBHDFS_EXTERNAL hdfsFS hdfsConnectAsUserNewInstance(const char* nn, tPort port, const char *user); /** * hdfsConnectNewInstance - Connect to an hdfs file system. * * Forces a new instance to be created * * @param nn The NameNode. See hdfsBuilderSetNameNode for details. * @param port The port on which the server is listening. * @return Returns a handle to the filesystem or NULL on error. * @deprecated Use hdfsBuilderConnect instead. */ LIBHDFS_EXTERNAL hdfsFS hdfsConnectNewInstance(const char* nn, tPort port); /** * Connect to HDFS using the parameters defined by the builder. * * The HDFS builder will be freed, whether or not the connection was * successful. * * Every successful call to hdfsBuilderConnect should be matched with a call * to hdfsDisconnect, when the hdfsFS is no longer needed. * * @param bld The HDFS builder * @return Returns a handle to the filesystem, or NULL on error. */ LIBHDFS_EXTERNAL hdfsFS hdfsBuilderConnect(struct hdfsBuilder *bld); /** * Create an HDFS builder. * * @return The HDFS builder, or NULL on error. */ LIBHDFS_EXTERNAL struct hdfsBuilder *hdfsNewBuilder(void); /** * Force the builder to always create a new instance of the FileSystem, * rather than possibly finding one in the cache. * * @param bld The HDFS builder */ LIBHDFS_EXTERNAL void hdfsBuilderSetForceNewInstance(struct hdfsBuilder *bld); /** * Set the HDFS NameNode to connect to. * * @param bld The HDFS builder * @param nn The NameNode to use. * * If the string given is 'default', the default NameNode * configuration will be used (from the XML configuration files) * * If NULL is given, a LocalFileSystem will be created. * * If the string starts with a protocol type such as file:// or * hdfs://, this protocol type will be used. If not, the * hdfs:// protocol type will be used. * * You may specify a NameNode port in the usual way by * passing a string of the format hdfs://<hostname>:<port>. * Alternately, you may set the port with * hdfsBuilderSetNameNodePort. However, you must not pass the * port in two different ways. */ LIBHDFS_EXTERNAL void hdfsBuilderSetNameNode(struct hdfsBuilder *bld, const char *nn); /** * Set the port of the HDFS NameNode to connect to. * * @param bld The HDFS builder * @param port The port. */ LIBHDFS_EXTERNAL void hdfsBuilderSetNameNodePort(struct hdfsBuilder *bld, tPort port);
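/*
 * Example: the builder-based connection flow recommended by the
 * deprecation notes above (an illustrative sketch; "default" selects the
 * NameNode from the XML configuration).  hdfsBuilderConnect consumes the
 * builder whether or not it succeeds, and each successful connect is
 * paired with an hdfsDisconnect (declared further below):
 *
 *     struct hdfsBuilder *bld = hdfsNewBuilder();
 *     hdfsBuilderSetNameNode(bld, "default");
 *     hdfsFS fs = hdfsBuilderConnect(bld);   // frees bld in all cases
 *     if (fs) {
 *         // ... use fs ...
 *         hdfsDisconnect(fs);
 *     }
 */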
*/ LIBHDFS_EXTERNAL void hdfsBuilderSetUserName(struct hdfsBuilder *bld, const char *userName); /** * Set the path to the Kerberos ticket cache to use when connecting to * the HDFS cluster. * * @param bld The HDFS builder * @param kerbTicketCachePath The Kerberos ticket cache path. The string * will be shallow-copied. */ LIBHDFS_EXTERNAL void hdfsBuilderSetKerbTicketCachePath(struct hdfsBuilder *bld, const char *kerbTicketCachePath); /** * Free an HDFS builder. * * It is normally not necessary to call this function since * hdfsBuilderConnect frees the builder. * * @param bld The HDFS builder */ LIBHDFS_EXTERNAL void hdfsFreeBuilder(struct hdfsBuilder *bld); /** * Set a configuration string for an HdfsBuilder. * * @param key The key to set. * @param val The value, or NULL to set no value. * This will be shallow-copied. You are responsible for * ensuring that it remains valid until the builder is * freed. * * @return 0 on success; nonzero error code otherwise. */ LIBHDFS_EXTERNAL int hdfsBuilderConfSetStr(struct hdfsBuilder *bld, const char *key, const char *val); /** * Get a configuration string. * * @param key The key to find * @param val (out param) The value. This will be set to NULL if the * key isn't found. You must free this string with * hdfsConfStrFree. * * @return 0 on success; nonzero error code otherwise. * Failure to find the key is not an error. */ LIBHDFS_EXTERNAL int hdfsConfGetStr(const char *key, char **val); /** * Get a configuration integer. * * @param key The key to find * @param val (out param) The value. This will NOT be changed if the * key isn't found. * * @return 0 on success; nonzero error code otherwise. * Failure to find the key is not an error. */ LIBHDFS_EXTERNAL int hdfsConfGetInt(const char *key, int32_t *val); /** * Free a configuration string found with hdfsConfGetStr. * * @param val A configuration string obtained from hdfsConfGetStr */ LIBHDFS_EXTERNAL void hdfsConfStrFree(char *val); /** * hdfsDisconnect - Disconnect from the hdfs file system. * Disconnect from hdfs. * @param fs The configured filesystem handle. * @return Returns 0 on success, -1 on error. * Even if there is an error, the resources associated with the * hdfsFS will be freed. */ LIBHDFS_EXTERNAL int hdfsDisconnect(hdfsFS fs); /** * hdfsOpenFile - Open a hdfs file in given mode. * @param fs The configured filesystem handle. * @param path The full path to the file. * @param flags - an | of bits/fcntl.h file flags - supported flags are O_RDONLY, O_WRONLY (meaning create or overwrite i.e., implies O_TRUNC), * O_WRONLY|O_APPEND. Other flags are generally ignored other than (O_RDWR || (O_EXCL & O_CREAT)) which return NULL and set errno to ENOTSUP. * @param bufferSize Size of buffer for read/write - pass 0 if you want * to use the default configured values. * @param replication Block replication - pass 0 if you want to use * the default configured values. * @param blocksize Size of block - pass 0 if you want to use the * default configured values. * @return Returns the handle to the open file or NULL on error. */ LIBHDFS_EXTERNAL hdfsFile hdfsOpenFile(hdfsFS fs, const char* path, int flags, int bufferSize, short replication, tSize blocksize); /** * hdfsTruncateFile - Truncate a hdfs file to the given length. * @param fs The configured filesystem handle. * @param path The full path to the file.
* @param newlength The size the file is to be truncated to * @return 1 if the file has been truncated to the desired newlength * and is immediately available to be reused for write operations * such as append. * 0 if a background process of adjusting the length of the last * block has been started, and clients should wait for it to * complete before proceeding with further file updates. * -1 on error. */ LIBHDFS_EXTERNAL int hdfsTruncateFile(hdfsFS fs, const char* path, tOffset newlength); /** * hdfsUnbufferFile - Reduce the buffering done on a file. * * @param file The file to unbuffer. * @return 0 on success * ENOTSUP if the file does not support unbuffering * Errno will also be set to this value. */ LIBHDFS_EXTERNAL int hdfsUnbufferFile(hdfsFile file); /** * hdfsCloseFile - Close an open file. * @param fs The configured filesystem handle. * @param file The file handle. * @return Returns 0 on success, -1 on error. * On error, errno will be set appropriately. * If the hdfs file was valid, the memory associated with it will * be freed at the end of this call, even if there was an I/O * error. */ LIBHDFS_EXTERNAL int hdfsCloseFile(hdfsFS fs, hdfsFile file); /** * hdfsExists - Checks if a given path exists on the filesystem * @param fs The configured filesystem handle. * @param path The path to look for * @return Returns 0 on success, -1 on error. */ LIBHDFS_EXTERNAL int hdfsExists(hdfsFS fs, const char *path); /** * hdfsSeek - Seek to given offset in file. * This works only for files opened in read-only mode. * @param fs The configured filesystem handle. * @param file The file handle. * @param desiredPos Offset into the file to seek into. * @return Returns 0 on success, -1 on error. */ LIBHDFS_EXTERNAL int hdfsSeek(hdfsFS fs, hdfsFile file, tOffset desiredPos); /** * hdfsTell - Get the current offset in the file, in bytes. * @param fs The configured filesystem handle. * @param file The file handle. * @return Current offset, -1 on error. */ LIBHDFS_EXTERNAL tOffset hdfsTell(hdfsFS fs, hdfsFile file); /** * hdfsRead - Read data from an open file. * @param fs The configured filesystem handle. * @param file The file handle. * @param buffer The buffer to copy read bytes into. * @param length The length of the buffer. * @return On success, a positive number indicating how many bytes * were read. * On end-of-file, 0. * On error, -1. Errno will be set to the error code. * Just like the POSIX read function, hdfsRead will return -1 * and set errno to EINTR if data is temporarily unavailable, * but we are not yet at the end of the file. */ LIBHDFS_EXTERNAL tSize hdfsRead(hdfsFS fs, hdfsFile file, void* buffer, tSize length); /** * hdfsPread - Positional read of data from an open file. * @param fs The configured filesystem handle. * @param file The file handle. * @param position Position from which to read * @param buffer The buffer to copy read bytes into. * @param length The length of the buffer. * @return See hdfsRead */ LIBHDFS_EXTERNAL tSize hdfsPread(hdfsFS fs, hdfsFile file, tOffset position, void* buffer, tSize length); /** * hdfsWrite - Write data into an open file. * @param fs The configured filesystem handle. * @param file The file handle. * @param buffer The data. * @param length The no. of bytes to write. * @return Returns the number of bytes written, -1 on error. */ LIBHDFS_EXTERNAL tSize hdfsWrite(hdfsFS fs, hdfsFile file, const void* buffer, tSize length); /** * hdfsFlush - Flush the data. * @param fs The configured filesystem handle. * @param file The file handle.
* @return Returns 0 on success, -1 on error. */ LIBHDFS_EXTERNAL int hdfsFlush(hdfsFS fs, hdfsFile file); /** * hdfsHFlush - Flush out the data in client's user buffer. After the * return of this call, new readers will see the data. * @param fs configured filesystem handle * @param file file handle * @return 0 on success, -1 on error and sets errno */ LIBHDFS_EXTERNAL int hdfsHFlush(hdfsFS fs, hdfsFile file); /** * hdfsHSync - Similar to posix fsync, Flush out the data in client's * user buffer all the way to the disk device (but the disk may have * it in its cache). * @param fs configured filesystem handle * @param file file handle * @return 0 on success, -1 on error and sets errno */ LIBHDFS_EXTERNAL int hdfsHSync(hdfsFS fs, hdfsFile file); /** * hdfsAvailable - Number of bytes that can be read from this * input stream without blocking. * @param fs The configured filesystem handle. * @param file The file handle. * @return Returns available bytes; -1 on error. */ LIBHDFS_EXTERNAL int hdfsAvailable(hdfsFS fs, hdfsFile file); /** * hdfsCopy - Copy file from one filesystem to another. * @param srcFS The handle to source filesystem. * @param src The path of source file. * @param dstFS The handle to destination filesystem. * @param dst The path of destination file. * @return Returns 0 on success, -1 on error. */ LIBHDFS_EXTERNAL int hdfsCopy(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst); /** * hdfsMove - Move file from one filesystem to another. * @param srcFS The handle to source filesystem. * @param src The path of source file. * @param dstFS The handle to destination filesystem. * @param dst The path of destination file. * @return Returns 0 on success, -1 on error. */ LIBHDFS_EXTERNAL int hdfsMove(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst); /** * hdfsDelete - Delete file. * @param fs The configured filesystem handle. * @param path The path of the file. * @param recursive if path is a directory and set to * non-zero, the directory is deleted else throws an exception. In * case of a file the recursive argument is irrelevant. * @return Returns 0 on success, -1 on error. */ LIBHDFS_EXTERNAL int hdfsDelete(hdfsFS fs, const char* path, int recursive); /** * hdfsRename - Rename file. * @param fs The configured filesystem handle. * @param oldPath The path of the source file. * @param newPath The path of the destination file. * @return Returns 0 on success, -1 on error. */ LIBHDFS_EXTERNAL int hdfsRename(hdfsFS fs, const char* oldPath, const char* newPath); /** * hdfsGetWorkingDirectory - Get the current working directory for * the given filesystem. * @param fs The configured filesystem handle. * @param buffer The user-buffer to copy path of cwd into. * @param bufferSize The length of user-buffer. * @return Returns buffer, NULL on error. */ LIBHDFS_EXTERNAL char* hdfsGetWorkingDirectory(hdfsFS fs, char *buffer, size_t bufferSize); /** * hdfsSetWorkingDirectory - Set the working directory. All relative * paths will be resolved relative to it. * @param fs The configured filesystem handle. * @param path The path of the new 'cwd'. * @return Returns 0 on success, -1 on error. */ LIBHDFS_EXTERNAL int hdfsSetWorkingDirectory(hdfsFS fs, const char* path); /** * hdfsCreateDirectory - Make the given file and all non-existent * parents into directories. * @param fs The configured filesystem handle. * @param path The path of the directory. * @return Returns 0 on success, -1 on error.
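 *
 * Editorial aside -- a hedged usage sketch combining this call with the
 * listing functions declared below (illustrative only, not part of the
 * original header; "/user/demo" is a placeholder path, error checks
 * elided):
 *
 *   if (hdfsCreateDirectory(fs, "/user/demo") == 0) {
 *       int n = 0;
 *       hdfsFileInfo *ls = hdfsListDirectory(fs, "/user/demo", &n);
 *       // ... inspect ls[0..n-1].mName, .mSize, .mKind ...
 *       hdfsFreeFileInfo(ls, n);  // free the array and its fields
 *   }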
*/ LIBHDFS_EXTERNAL int hdfsCreateDirectory(hdfsFS fs, const char* path); /** * hdfsSetReplication - Set the replication of the specified * file to the supplied value * @param fs The configured filesystem handle. * @param path The path of the file. * @return Returns 0 on success, -1 on error. */ LIBHDFS_EXTERNAL int hdfsSetReplication(hdfsFS fs, const char* path, int16_t replication); /** * hdfsFileInfo - Information about a file/directory. */ typedef struct { tObjectKind mKind; /* file or directory */ char *mName; /* the name of the file */ tTime mLastMod; /* the last modification time for the file in seconds */ tOffset mSize; /* the size of the file in bytes */ short mReplication; /* the count of replicas */ tOffset mBlockSize; /* the block size for the file */ char *mOwner; /* the owner of the file */ char *mGroup; /* the group associated with the file */ short mPermissions; /* the permissions associated with the file */ tTime mLastAccess; /* the last access time for the file in seconds */ } hdfsFileInfo; /** * hdfsListDirectory - Get list of files/directories for a given * directory-path. hdfsFreeFileInfo should be called to deallocate memory. * @param fs The configured filesystem handle. * @param path The path of the directory. * @param numEntries Set to the number of files/directories in path. * @return Returns a dynamically-allocated array of hdfsFileInfo * objects; NULL on error or empty directory. * errno is set to non-zero on error or zero on success. */ LIBHDFS_EXTERNAL hdfsFileInfo *hdfsListDirectory(hdfsFS fs, const char* path, int *numEntries); /** * hdfsGetPathInfo - Get information about a path as a (dynamically * allocated) single hdfsFileInfo struct. hdfsFreeFileInfo should be * called when the pointer is no longer needed. * @param fs The configured filesystem handle. * @param path The path of the file. * @return Returns a dynamically-allocated hdfsFileInfo object; * NULL on error. */ LIBHDFS_EXTERNAL hdfsFileInfo *hdfsGetPathInfo(hdfsFS fs, const char* path); /** * hdfsFreeFileInfo - Free up the hdfsFileInfo array (including fields) * @param hdfsFileInfo The array of dynamically-allocated hdfsFileInfo * objects. * @param numEntries The size of the array. */ LIBHDFS_EXTERNAL void hdfsFreeFileInfo(hdfsFileInfo *hdfsFileInfo, int numEntries); /** * hdfsFileIsEncrypted: determine if a file is encrypted based on its * hdfsFileInfo. * @return -1 if there was an error (errno will be set), 0 if the file is * not encrypted, 1 if the file is encrypted. */ LIBHDFS_EXTERNAL int hdfsFileIsEncrypted(hdfsFileInfo *hdfsFileInfo); /** * hdfsGetHosts - Get hostnames where a particular block (determined by * pos & blocksize) of a file is stored. The last element in the array * is NULL. Due to replication, a single block could be present on * multiple hosts. * @param fs The configured filesystem handle. * @param path The path of the file. * @param start The start of the block. * @param length The length of the block. * @return Returns a dynamically-allocated 2-d array of blocks-hosts; * NULL on error. */ LIBHDFS_EXTERNAL char*** hdfsGetHosts(hdfsFS fs, const char* path, tOffset start, tOffset length); /** * hdfsFreeHosts - Free up the structure returned by hdfsGetHosts * @param hdfsFileInfo The array of dynamically-allocated hdfsFileInfo * objects. * @param numEntries The size of the array. */ LIBHDFS_EXTERNAL void hdfsFreeHosts(char ***blockHosts); /** * hdfsGetDefaultBlockSize - Get the default blocksize. * * @param fs The configured filesystem handle. 
* @deprecated Use hdfsGetDefaultBlockSizeAtPath instead. * * @return Returns the default blocksize, or -1 on error. */ LIBHDFS_EXTERNAL tOffset hdfsGetDefaultBlockSize(hdfsFS fs); /** * hdfsGetDefaultBlockSizeAtPath - Get the default blocksize at the * filesystem indicated by a given path. * * @param fs The configured filesystem handle. * @param path The given path will be used to locate the actual * filesystem. The full path does not have to exist. * * @return Returns the default blocksize, or -1 on error. */ LIBHDFS_EXTERNAL tOffset hdfsGetDefaultBlockSizeAtPath(hdfsFS fs, const char *path); /** * hdfsGetCapacity - Return the raw capacity of the filesystem. * @param fs The configured filesystem handle. * @return Returns the raw-capacity; -1 on error. */ LIBHDFS_EXTERNAL tOffset hdfsGetCapacity(hdfsFS fs); /** * hdfsGetUsed - Return the total raw size of all files in the filesystem. * @param fs The configured filesystem handle. * @return Returns the total-size; -1 on error. */ LIBHDFS_EXTERNAL tOffset hdfsGetUsed(hdfsFS fs); /** * Change the user and/or group of a file or directory. * * @param fs The configured filesystem handle. * @param path the path to the file or directory * @param owner User string. Set to NULL for 'no change' * @param group Group string. Set to NULL for 'no change' * @return 0 on success else -1 */ LIBHDFS_EXTERNAL int hdfsChown(hdfsFS fs, const char* path, const char *owner, const char *group); /** * hdfsChmod * @param fs The configured filesystem handle. * @param path the path to the file or directory * @param mode the bitmask to set it to * @return 0 on success else -1 */ LIBHDFS_EXTERNAL int hdfsChmod(hdfsFS fs, const char* path, short mode); /** * hdfsUtime * @param fs The configured filesystem handle. * @param path the path to the file or directory * @param mtime new modification time or -1 for no change * @param atime new access time or -1 for no change * @return 0 on success else -1 */ LIBHDFS_EXTERNAL int hdfsUtime(hdfsFS fs, const char* path, tTime mtime, tTime atime); /** * Allocate a zero-copy options structure. * * You must free all options structures allocated with this function using * hadoopRzOptionsFree. * * @return A zero-copy options structure, or NULL if one could * not be allocated. If NULL is returned, errno will * contain the error number. */ LIBHDFS_EXTERNAL struct hadoopRzOptions *hadoopRzOptionsAlloc(void); /** * Determine whether we should skip checksums in read0. * * @param opts The options structure. * @param skip Nonzero to skip checksums sometimes; zero to always * check them. * * @return 0 on success; -1 plus errno on failure. */ LIBHDFS_EXTERNAL int hadoopRzOptionsSetSkipChecksum( struct hadoopRzOptions *opts, int skip); /** * Set the ByteBufferPool to use with read0. * * @param opts The options structure. * @param className If this is NULL, we will not use any * ByteBufferPool. If this is non-NULL, it will be * treated as the name of the pool class to use. * For example, you can use * ELASTIC_BYTE_BUFFER_POOL_CLASS. * * @return 0 if the ByteBufferPool class was found and * instantiated; * -1 plus errno otherwise. */ LIBHDFS_EXTERNAL int hadoopRzOptionsSetByteBufferPool( struct hadoopRzOptions *opts, const char *className); /** * Free a hadoopRzOptionsFree structure. * * @param opts The options structure to free. * Any associated ByteBufferPool will also be freed. */ LIBHDFS_EXTERNAL void hadoopRzOptionsFree(struct hadoopRzOptions *opts); /** * Perform a byte buffer read. * If possible, this will be a zero-copy (mmap) read. 
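 *
 * Editorial aside -- a hedged sketch of the zero-copy read sequence
 * built from the functions declared above and below (illustrative
 * only, not part of the original header; error checks elided):
 *
 *   struct hadoopRzOptions *opts = hadoopRzOptionsAlloc();
 *   hadoopRzOptionsSetByteBufferPool(opts, ELASTIC_BYTE_BUFFER_POOL_CLASS);
 *   struct hadoopRzBuffer *buf = hadoopReadZero(file, opts, 65536);
 *   const void *data = hadoopRzBufferGet(buf);   // NULL once at EOF
 *   int32_t len = hadoopRzBufferLength(buf);
 *   // ... consume len bytes at data ...
 *   hadoopRzBufferFree(file, buf);               // always release the buffer
 *   hadoopRzOptionsFree(opts);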
* * @param file The file to read from. * @param opts An options structure created by hadoopRzOptionsAlloc. * @param maxLength The maximum length to read. We may read fewer bytes * than this length. * * @return On success, we will return a new hadoopRzBuffer. * This buffer will continue to be valid and readable * until it is released by hadoopRzBufferFree. Failure to * release a buffer will lead to a memory leak. * You can access the data within the hadoopRzBuffer with * hadoopRzBufferGet. If you have reached EOF, the data * within the hadoopRzBuffer will be NULL. You must still * free hadoopRzBuffer instances containing NULL. * * On failure, we will return NULL plus an errno code. * errno = EOPNOTSUPP indicates that we could not do a * zero-copy read, and there was no ByteBufferPool * supplied. */ LIBHDFS_EXTERNAL struct hadoopRzBuffer* hadoopReadZero(hdfsFile file, struct hadoopRzOptions *opts, int32_t maxLength); /** * Determine the length of the buffer returned from readZero. * * @param buffer a buffer returned from readZero. * @return the length of the buffer. */ LIBHDFS_EXTERNAL int32_t hadoopRzBufferLength(const struct hadoopRzBuffer *buffer); /** * Get a pointer to the raw buffer returned from readZero. * * To find out how many bytes this buffer contains, call * hadoopRzBufferLength. * * @param buffer a buffer returned from readZero. * @return a pointer to the start of the buffer. This will be * NULL when end-of-file has been reached. */ LIBHDFS_EXTERNAL const void *hadoopRzBufferGet(const struct hadoopRzBuffer *buffer); /** * Release a buffer obtained through readZero. * * @param file The hdfs stream that created this buffer. This must be * the same stream you called hadoopReadZero on. * @param buffer The buffer to release. */ LIBHDFS_EXTERNAL void hadoopRzBufferFree(hdfsFile file, struct hadoopRzBuffer *buffer); #ifdef __cplusplus } #endif #undef LIBHDFS_EXTERNAL #endif /*LIBHDFS_HDFS_H*/ /** * vim: ts=4: sw=4: et */ ================================================ FILE: src/libhdfs/jni_helper.c ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "config.h" #include "exception.h" #include "jni_helper.h" #include "platform.h" #include "common/htable.h" #include "os/mutexes.h" #include "os/thread_local_storage.h" #include <stdio.h> #include <string.h> static struct htable *gClassRefHTable = NULL; /** The Native return types that methods could return */ #define JVOID 'V' #define JOBJECT 'L' #define JARRAYOBJECT '[' #define JBOOLEAN 'Z' #define JBYTE 'B' #define JCHAR 'C' #define JSHORT 'S' #define JINT 'I' #define JLONG 'J' #define JFLOAT 'F' #define JDOUBLE 'D' /** * MAX_HASH_TABLE_ELEM: The maximum no. of entries in the hashtable. * It's set to 4096 to account for (classNames + No.
of threads) */ #define MAX_HASH_TABLE_ELEM 4096 /** * Length of buffer for retrieving created JVMs. (We only ever create one.) */ #define VM_BUF_LENGTH 1 void destroyLocalReference(JNIEnv *env, jobject jObject) { if (jObject) (*env)->DeleteLocalRef(env, jObject); } static jthrowable validateMethodType(JNIEnv *env, MethType methType) { if (methType != STATIC && methType != INSTANCE) { return newRuntimeError(env, "validateMethodType(methType=%d): " "illegal method type.\n", methType); } return NULL; } jthrowable newJavaStr(JNIEnv *env, const char *str, jstring *out) { jstring jstr; if (!str) { /* Can't pass NULL to NewStringUTF: the result would be * implementation-defined. */ *out = NULL; return NULL; } jstr = (*env)->NewStringUTF(env, str); if (!jstr) { /* If NewStringUTF returns NULL, an exception has been thrown, * which we need to handle. Probably an OOM. */ return getPendingExceptionAndClear(env); } *out = jstr; return NULL; } jthrowable newCStr(JNIEnv *env, jstring jstr, char **out) { const char *tmp; if (!jstr) { *out = NULL; return NULL; } tmp = (*env)->GetStringUTFChars(env, jstr, NULL); if (!tmp) { return getPendingExceptionAndClear(env); } *out = strdup(tmp); (*env)->ReleaseStringUTFChars(env, jstr, tmp); return NULL; } jthrowable invokeMethod(JNIEnv *env, jvalue *retval, MethType methType, jobject instObj, const char *className, const char *methName, const char *methSignature, ...) { va_list args; jclass cls; jmethodID mid; jthrowable jthr; const char *str; char returnType; jthr = validateMethodType(env, methType); if (jthr) return jthr; jthr = globalClassReference(className, env, &cls); if (jthr) return jthr; jthr = methodIdFromClass(className, methName, methSignature, methType, env, &mid); if (jthr) return jthr; str = methSignature; while (*str != ')') str++; str++; returnType = *str; va_start(args, methSignature); if (returnType == JOBJECT || returnType == JARRAYOBJECT) { jobject jobj = NULL; if (methType == STATIC) { jobj = (*env)->CallStaticObjectMethodV(env, cls, mid, args); } else if (methType == INSTANCE) { jobj = (*env)->CallObjectMethodV(env, instObj, mid, args); } retval->l = jobj; } else if (returnType == JVOID) { if (methType == STATIC) { (*env)->CallStaticVoidMethodV(env, cls, mid, args); } else if (methType == INSTANCE) { (*env)->CallVoidMethodV(env, instObj, mid, args); } } else if (returnType == JBOOLEAN) { jboolean jbool = 0; if (methType == STATIC) { jbool = (*env)->CallStaticBooleanMethodV(env, cls, mid, args); } else if (methType == INSTANCE) { jbool = (*env)->CallBooleanMethodV(env, instObj, mid, args); } retval->z = jbool; } else if (returnType == JSHORT) { jshort js = 0; if (methType == STATIC) { js = (*env)->CallStaticShortMethodV(env, cls, mid, args); } else if (methType == INSTANCE) { js = (*env)->CallShortMethodV(env, instObj, mid, args); } retval->s = js; } else if (returnType == JLONG) { jlong jl = -1; if (methType == STATIC) { jl = (*env)->CallStaticLongMethodV(env, cls, mid, args); } else if (methType == INSTANCE) { jl = (*env)->CallLongMethodV(env, instObj, mid, args); } retval->j = jl; } else if (returnType == JINT) { jint ji = -1; if (methType == STATIC) { ji = (*env)->CallStaticIntMethodV(env, cls, mid, args); } else if (methType == INSTANCE) { ji = (*env)->CallIntMethodV(env, instObj, mid, args); } retval->i = ji; } va_end(args); jthr = (*env)->ExceptionOccurred(env); if (jthr) { (*env)->ExceptionClear(env); return jthr; } return NULL; } jthrowable constructNewObjectOfClass(JNIEnv *env, jobject *out, const char *className, const char
*ctorSignature, ...) { va_list args; jclass cls; jmethodID mid; jobject jobj; jthrowable jthr; jthr = globalClassReference(className, env, &cls); if (jthr) return jthr; jthr = methodIdFromClass(className, "<init>", ctorSignature, INSTANCE, env, &mid); if (jthr) return jthr; va_start(args, ctorSignature); jobj = (*env)->NewObjectV(env, cls, mid, args); va_end(args); if (!jobj) return getPendingExceptionAndClear(env); *out = jobj; return NULL; } jthrowable methodIdFromClass(const char *className, const char *methName, const char *methSignature, MethType methType, JNIEnv *env, jmethodID *out) { jclass cls; jthrowable jthr; jmethodID mid = 0; jthr = globalClassReference(className, env, &cls); if (jthr) return jthr; jthr = validateMethodType(env, methType); if (jthr) return jthr; if (methType == STATIC) { mid = (*env)->GetStaticMethodID(env, cls, methName, methSignature); } else if (methType == INSTANCE) { mid = (*env)->GetMethodID(env, cls, methName, methSignature); } if (mid == NULL) { fprintf(stderr, "could not find method %s from class %s with " "signature %s\n", methName, className, methSignature); return getPendingExceptionAndClear(env); } *out = mid; return NULL; } jthrowable globalClassReference(const char *className, JNIEnv *env, jclass *out) { jthrowable jthr = NULL; jclass local_clazz = NULL; jclass clazz = NULL; int ret; mutexLock(&hdfsHashMutex); if (!gClassRefHTable) { gClassRefHTable = htable_alloc(MAX_HASH_TABLE_ELEM, ht_hash_string, ht_compare_string); if (!gClassRefHTable) { jthr = newRuntimeError(env, "htable_alloc failed\n"); goto done; } } clazz = htable_get(gClassRefHTable, className); if (clazz) { *out = clazz; goto done; } local_clazz = (*env)->FindClass(env,className); if (!local_clazz) { jthr = getPendingExceptionAndClear(env); goto done; } clazz = (*env)->NewGlobalRef(env, local_clazz); if (!clazz) { jthr = getPendingExceptionAndClear(env); goto done; } ret = htable_put(gClassRefHTable, (void*)className, clazz); if (ret) { jthr = newRuntimeError(env, "htable_put failed with error " "code %d\n", ret); goto done; } *out = clazz; jthr = NULL; done: mutexUnlock(&hdfsHashMutex); (*env)->DeleteLocalRef(env, local_clazz); if (jthr && clazz) { (*env)->DeleteGlobalRef(env, clazz); } return jthr; } jthrowable classNameOfObject(jobject jobj, JNIEnv *env, char **name) { jthrowable jthr; jclass cls, clsClass = NULL; jmethodID mid; jstring str = NULL; const char *cstr = NULL; char *newstr; cls = (*env)->GetObjectClass(env, jobj); if (cls == NULL) { jthr = getPendingExceptionAndClear(env); goto done; } clsClass = (*env)->FindClass(env, "java/lang/Class"); if (clsClass == NULL) { jthr = getPendingExceptionAndClear(env); goto done; } mid = (*env)->GetMethodID(env, clsClass, "getName", "()Ljava/lang/String;"); if (mid == NULL) { jthr = getPendingExceptionAndClear(env); goto done; } str = (*env)->CallObjectMethod(env, cls, mid); if (str == NULL) { jthr = getPendingExceptionAndClear(env); goto done; } cstr = (*env)->GetStringUTFChars(env, str, NULL); if (!cstr) { jthr = getPendingExceptionAndClear(env); goto done; } newstr = strdup(cstr); if (newstr == NULL) { jthr = newRuntimeError(env, "classNameOfObject: out of memory"); goto done; } *name = newstr; jthr = NULL; done: destroyLocalReference(env, cls); destroyLocalReference(env, clsClass); if (str) { if (cstr) (*env)->ReleaseStringUTFChars(env, str, cstr); (*env)->DeleteLocalRef(env, str); } return jthr; } /** * Get the global JNI environment. * * We only have to create the JVM once. After that, we can use it in * every thread.
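 *
 * (Editorial aside, not in the original source: the invokeMethod helper
 * defined above is driven with a class name, method name and JNI
 * signature, e.g. the call made below in this very function --
 *
 *   jthrowable jthr = invokeMethod(env, NULL, STATIC, NULL,
 *       "org/apache/hadoop/fs/FileSystem", "loadFileSystems", "()V");
 *
 * -- where a non-NULL return is the pending exception, already cleared.)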
You must be holding the jvmMutex when you call this * function. * * @return The JNIEnv on success; NULL otherwise */ static JNIEnv* getGlobalJNIEnv(void) { JavaVM* vmBuf[VM_BUF_LENGTH]; JNIEnv *env; jint rv = 0; jint noVMs = 0; jthrowable jthr; char *hadoopClassPath; const char *hadoopClassPathVMArg = "-Djava.class.path="; size_t optHadoopClassPathLen; char *optHadoopClassPath; int noArgs = 1; char *hadoopJvmArgs; char jvmArgDelims[] = " "; char *str, *token, *savePtr; JavaVMInitArgs vm_args; JavaVM *vm; JavaVMOption *options; rv = JNI_GetCreatedJavaVMs(&(vmBuf[0]), VM_BUF_LENGTH, &noVMs); if (rv != 0) { fprintf(stderr, "JNI_GetCreatedJavaVMs failed with error: %d\n", rv); return NULL; } if (noVMs == 0) { //Get the environment variables for initializing the JVM hadoopClassPath = getenv("CLASSPATH"); if (hadoopClassPath == NULL) { fprintf(stderr, "Environment variable CLASSPATH not set!\n"); return NULL; } optHadoopClassPathLen = strlen(hadoopClassPath) + strlen(hadoopClassPathVMArg) + 1; optHadoopClassPath = malloc(sizeof(char)*optHadoopClassPathLen); snprintf(optHadoopClassPath, optHadoopClassPathLen, "%s%s", hadoopClassPathVMArg, hadoopClassPath); // Determine the # of LIBHDFS_OPTS args hadoopJvmArgs = getenv("LIBHDFS_OPTS"); if (hadoopJvmArgs != NULL) { hadoopJvmArgs = strdup(hadoopJvmArgs); for (noArgs = 1, str = hadoopJvmArgs; ; noArgs++, str = NULL) { token = strtok_r(str, jvmArgDelims, &savePtr); if (NULL == token) { break; } } free(hadoopJvmArgs); } // Now that we know the # args, populate the options array options = calloc(noArgs, sizeof(JavaVMOption)); if (!options) { fputs("Call to calloc failed\n", stderr); free(optHadoopClassPath); return NULL; } options[0].optionString = optHadoopClassPath; hadoopJvmArgs = getenv("LIBHDFS_OPTS"); if (hadoopJvmArgs != NULL) { hadoopJvmArgs = strdup(hadoopJvmArgs); for (noArgs = 1, str = hadoopJvmArgs; ; noArgs++, str = NULL) { token = strtok_r(str, jvmArgDelims, &savePtr); if (NULL == token) { break; } options[noArgs].optionString = token; } } //Create the VM vm_args.version = JNI_VERSION_1_2; vm_args.options = options; vm_args.nOptions = noArgs; vm_args.ignoreUnrecognized = 1; rv = JNI_CreateJavaVM(&vm, (void*)&env, &vm_args); if (hadoopJvmArgs != NULL) { free(hadoopJvmArgs); } free(optHadoopClassPath); free(options); if (rv != 0) { fprintf(stderr, "Call to JNI_CreateJavaVM failed " "with error: %d\n", rv); return NULL; } jthr = invokeMethod(env, NULL, STATIC, NULL, "org/apache/hadoop/fs/FileSystem", "loadFileSystems", "()V"); if (jthr) { printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "loadFileSystems"); } } else { //Attach this thread to the VM vm = vmBuf[0]; rv = (*vm)->AttachCurrentThread(vm, (void*)&env, 0); if (rv != 0) { fprintf(stderr, "Call to AttachCurrentThread " "failed with error: %d\n", rv); return NULL; } } return env; } /** * getJNIEnv: A helper function to get the JNIEnv* for the given thread. * If no JVM exists, then one will be created. JVM command line arguments * are obtained from the LIBHDFS_OPTS environment variable. * * Implementation note: we rely on POSIX thread-local storage (tls). * This allows us to associate a destructor function with each thread, that * will detach the thread from the Java VM when the thread terminates. If we * fail to do this, it will cause a memory leak. * * However, POSIX TLS is not the most efficient way to do things. It requires a * key to be initialized before it can be used.
Since we don't know if this key * is initialized at the start of this function, we have to lock a mutex first * and check. Luckily, most operating systems support the more efficient * __thread construct, which is initialized by the linker. * * @param: None. * @return The JNIEnv* corresponding to the thread. */ JNIEnv* getJNIEnv(void) { JNIEnv *env; THREAD_LOCAL_STORAGE_GET_QUICK(); mutexLock(&jvmMutex); if (threadLocalStorageGet(&env)) { mutexUnlock(&jvmMutex); return NULL; } if (env) { mutexUnlock(&jvmMutex); return env; } env = getGlobalJNIEnv(); mutexUnlock(&jvmMutex); if (!env) { fprintf(stderr, "getJNIEnv: getGlobalJNIEnv failed\n"); return NULL; } if (threadLocalStorageSet(env)) { return NULL; } THREAD_LOCAL_STORAGE_SET_QUICK(env); return env; } int javaObjectIsOfClass(JNIEnv *env, jobject obj, const char *name) { jclass clazz; int ret; clazz = (*env)->FindClass(env, name); if (!clazz) { printPendingExceptionAndFree(env, PRINT_EXC_ALL, "javaObjectIsOfClass(%s)", name); return -1; } ret = (*env)->IsInstanceOf(env, obj, clazz); (*env)->DeleteLocalRef(env, clazz); return ret == JNI_TRUE ? 1 : 0; } jthrowable hadoopConfSetStr(JNIEnv *env, jobject jConfiguration, const char *key, const char *value) { jthrowable jthr; jstring jkey = NULL, jvalue = NULL; jthr = newJavaStr(env, key, &jkey); if (jthr) goto done; jthr = newJavaStr(env, value, &jvalue); if (jthr) goto done; jthr = invokeMethod(env, NULL, INSTANCE, jConfiguration, "org/apache/hadoop/conf/Configuration", "set", "(Ljava/lang/String;Ljava/lang/String;)V", jkey, jvalue); if (jthr) goto done; done: (*env)->DeleteLocalRef(env, jkey); (*env)->DeleteLocalRef(env, jvalue); return jthr; } jthrowable fetchEnumInstance(JNIEnv *env, const char *className, const char *valueName, jobject *out) { jclass clazz; jfieldID fieldId; jobject jEnum; char prettyClass[256]; clazz = (*env)->FindClass(env, className); if (!clazz) { return newRuntimeError(env, "fetchEnum(%s, %s): failed to find class.", className, valueName); } if (snprintf(prettyClass, sizeof(prettyClass), "L%s;", className) >= sizeof(prettyClass)) { return newRuntimeError(env, "fetchEnum(%s, %s): class name too long.", className, valueName); } fieldId = (*env)->GetStaticFieldID(env, clazz, valueName, prettyClass); if (!fieldId) { return getPendingExceptionAndClear(env); } jEnum = (*env)->GetStaticObjectField(env, clazz, fieldId); if (!jEnum) { return getPendingExceptionAndClear(env); } *out = jEnum; return NULL; } ================================================ FILE: src/libhdfs/jni_helper.h ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #ifndef LIBHDFS_JNI_HELPER_H #define LIBHDFS_JNI_HELPER_H #include <jni.h> #include <stdio.h> #include <stdlib.h> #include <stdarg.h> #include <search.h> #define PATH_SEPARATOR ':' /** Denote the method we want to invoke as STATIC or INSTANCE */ typedef enum { STATIC, INSTANCE } MethType; /** * Create a new malloc'ed C string from a Java string. * * @param env The JNI environment * @param jstr The Java string * @param out (out param) the malloc'ed C string * * @return NULL on success; the exception otherwise */ jthrowable newCStr(JNIEnv *env, jstring jstr, char **out); /** * Create a new Java string from a C string. * * @param env The JNI environment * @param str The C string * @param out (out param) the java string * * @return NULL on success; the exception otherwise */ jthrowable newJavaStr(JNIEnv *env, const char *str, jstring *out); /** * Helper function to destroy a local reference of java.lang.Object * @param env: The JNIEnv pointer. * @param jObject: The local reference of java.lang.Object object * @return None. */ void destroyLocalReference(JNIEnv *env, jobject jObject); /** invokeMethod: Invoke a Static or Instance method. * className: Name of the class where the method can be found * methName: Name of the method * methSignature: the signature of the method "(arg-types)ret-type" * methType: The type of the method (STATIC or INSTANCE) * instObj: Required if the methType is INSTANCE. The object to invoke the method on. * env: The JNIEnv pointer * retval: The pointer to a union type which will contain the result of the method invocation, e.g. if the method returns an Object, retval will be set to that, if the method returns boolean, retval will be set to the value (JNI_TRUE or JNI_FALSE), etc. * Arguments (the method arguments) must be passed after methSignature * RETURNS: NULL on success; on failure, a reference to the thrown (and now cleared) exception, in which case the result stored at retval is undefined. */ jthrowable invokeMethod(JNIEnv *env, jvalue *retval, MethType methType, jobject instObj, const char *className, const char *methName, const char *methSignature, ...); jthrowable constructNewObjectOfClass(JNIEnv *env, jobject *out, const char *className, const char *ctorSignature, ...); jthrowable methodIdFromClass(const char *className, const char *methName, const char *methSignature, MethType methType, JNIEnv *env, jmethodID *out); jthrowable globalClassReference(const char *className, JNIEnv *env, jclass *out); /** classNameOfObject: Get an object's class name. * @param jobj: The object. * @param env: The JNIEnv pointer. * @param name: (out param) On success, will contain a string containing the * class name. This string must be freed by the caller. * @return NULL on success, or the exception */ jthrowable classNameOfObject(jobject jobj, JNIEnv *env, char **name); /** getJNIEnv: A helper function to get the JNIEnv* for the given thread. * If no JVM exists, then one will be created. JVM command line arguments * are obtained from the LIBHDFS_OPTS environment variable. * @param: None. * @return The JNIEnv* corresponding to the thread. * */ JNIEnv* getJNIEnv(void); /** * Figure out if a Java object is an instance of a particular class. * * @param env The Java environment. * @param obj The object to check. * @param name The class name to check. * * @return -1 if we failed to find the referenced class name. * 0 if the object is not of the given class. * 1 if the object is of the given class.
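 *
 * Editorial aside -- a hedged usage sketch (illustrative only, not part
 * of the original header):
 *
 *   int r = javaObjectIsOfClass(env, obj, "java/io/FileNotFoundException");
 *   if (r == 1) {
 *       // obj is an instance of the named class
 *   } else if (r == 0) {
 *       // it is not
 *   } else {
 *       // r == -1: class lookup failed; an error was already printed
 *   }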
*/ int javaObjectIsOfClass(JNIEnv *env, jobject obj, const char *name); /** * Set a value in a configuration object. * * @param env The JNI environment * @param jConfiguration The configuration object to modify * @param key The key to modify * @param value The value to set the key to * * @return NULL on success; exception otherwise */ jthrowable hadoopConfSetStr(JNIEnv *env, jobject jConfiguration, const char *key, const char *value); /** * Fetch an instance of an Enum. * * @param env The JNI environment. * @param className The enum class name. * @param valueName The name of the enum value * @param out (out param) on success, a local reference to an * instance of the enum object. (Since Java enums are * singletons, this is also the only instance.) * * @return NULL on success; exception otherwise */ jthrowable fetchEnumInstance(JNIEnv *env, const char *className, const char *valueName, jobject *out); #endif /*LIBHDFS_JNI_HELPER_H*/ /** * vim: ts=4: sw=4: et: */ ================================================ FILE: src/libhdfs/os/mutexes.h ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef LIBHDFS_MUTEXES_H #define LIBHDFS_MUTEXES_H /* * Defines abstraction over platform-specific mutexes. libhdfs has no formal * initialization function that users would call from a single-threaded context * to initialize the library. This creates a challenge for bootstrapping the * mutexes. To address this, all required mutexes are pre-defined here with * external storage. Platform-specific implementations must guarantee that the * mutexes are initialized via static initialization. */ #include "platform.h" /** Mutex protecting the class reference hash table. */ extern mutex hdfsHashMutex; /** Mutex protecting singleton JVM instance. */ extern mutex jvmMutex; /** * Locks a mutex. * * @param m mutex * @return 0 if successful, non-zero otherwise */ int mutexLock(mutex *m); /** * Unlocks a mutex. * * @param m mutex * @return 0 if successful, non-zero otherwise */ int mutexUnlock(mutex *m); #endif ================================================ FILE: src/libhdfs/os/posix/mutexes.c ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "os/mutexes.h" #include <pthread.h> #include <stdio.h> mutex hdfsHashMutex = PTHREAD_MUTEX_INITIALIZER; mutex jvmMutex = PTHREAD_MUTEX_INITIALIZER; int mutexLock(mutex *m) { int ret = pthread_mutex_lock(m); if (ret) { fprintf(stderr, "mutexLock: pthread_mutex_lock failed with error %d\n", ret); } return ret; } int mutexUnlock(mutex *m) { int ret = pthread_mutex_unlock(m); if (ret) { fprintf(stderr, "mutexUnlock: pthread_mutex_unlock failed with error %d\n", ret); } return ret; } ================================================ FILE: src/libhdfs/os/posix/platform.h ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef LIBHDFS_PLATFORM_H #define LIBHDFS_PLATFORM_H #include <pthread.h> /* Use gcc type-checked format arguments. */ #define TYPE_CHECKED_PRINTF_FORMAT(formatArg, varArgs) \ __attribute__((format(printf, formatArg, varArgs))) /* * Mutex and thread data types defined by pthreads. */ typedef pthread_mutex_t mutex; typedef pthread_t threadId; #endif ================================================ FILE: src/libhdfs/os/posix/thread.c ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "os/thread.h" #include <pthread.h> #include <stdio.h> /** * Defines a helper function that adapts function pointer provided by caller to * the type required by pthread_create.
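 *
 * (Editorial aside, not in the original source: a hedged sketch of how
 * the thread abstraction declared in os/thread.h is meant to be driven;
 * workerFn and someArg are hypothetical --
 *
 *   thread t;
 *   t.start = workerFn;      // void workerFn(void *arg)
 *   t.arg = someArg;
 *   if (threadCreate(&t) == 0) {
 *       threadJoin(&t);      // block until workerFn returns
 *   }
 *
 * -- threadCreate starts the thread immediately via pthread_create.)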
* * @param toRun thread to run * @return void* result of running thread (always NULL) */ static void* runThread(void *toRun) { const thread *t = toRun; t->start(t->arg); return NULL; } int threadCreate(thread *t) { int ret; ret = pthread_create(&t->id, NULL, runThread, t); if (ret) { fprintf(stderr, "threadCreate: pthread_create failed with error %d\n", ret); } return ret; } int threadJoin(const thread *t) { int ret = pthread_join(t->id, NULL); if (ret) { fprintf(stderr, "threadJoin: pthread_join failed with error %d\n", ret); } return ret; } ================================================ FILE: src/libhdfs/os/posix/thread_local_storage.c ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "os/thread_local_storage.h" #include <jni.h> #include <pthread.h> #include <stdio.h> /** Key that allows us to retrieve thread-local storage */ static pthread_key_t gTlsKey; /** nonzero if we succeeded in initializing gTlsKey. Protected by the jvmMutex */ static int gTlsKeyInitialized = 0; /** * The function that is called whenever a thread with libhdfs thread local data * is destroyed. * * @param v The thread-local data */ static void hdfsThreadDestructor(void *v) { JavaVM *vm; JNIEnv *env = v; jint ret; ret = (*env)->GetJavaVM(env, &vm); if (ret) { fprintf(stderr, "hdfsThreadDestructor: GetJavaVM failed with error %d\n", ret); (*env)->ExceptionDescribe(env); } else { (*vm)->DetachCurrentThread(vm); } } int threadLocalStorageGet(JNIEnv **env) { int ret = 0; if (!gTlsKeyInitialized) { ret = pthread_key_create(&gTlsKey, hdfsThreadDestructor); if (ret) { fprintf(stderr, "threadLocalStorageGet: pthread_key_create failed with error %d\n", ret); return ret; } gTlsKeyInitialized = 1; } *env = pthread_getspecific(gTlsKey); return ret; } int threadLocalStorageSet(JNIEnv *env) { int ret = pthread_setspecific(gTlsKey, env); if (ret) { fprintf(stderr, "threadLocalStorageSet: pthread_setspecific failed with error %d\n", ret); hdfsThreadDestructor(env); } return ret; } ================================================ FILE: src/libhdfs/os/thread.h ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * limitations under the License. */ #ifndef LIBHDFS_THREAD_H #define LIBHDFS_THREAD_H /* * Defines abstraction over platform-specific threads. */ #include "platform.h" /** Pointer to function to run in thread. */ typedef void (*threadProcedure)(void *); /** Structure containing a thread's ID, starting address and argument. */ typedef struct { threadId id; threadProcedure start; void *arg; } thread; /** * Creates and immediately starts a new thread. * * @param t thread to create * @return 0 if successful, non-zero otherwise */ int threadCreate(thread *t); /** * Joins to the given thread, blocking if necessary. * * @param t thread to join * @return 0 if successful, non-zero otherwise */ int threadJoin(const thread *t); #endif ================================================ FILE: src/libhdfs/os/thread_local_storage.h ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef LIBHDFS_THREAD_LOCAL_STORAGE_H #define LIBHDFS_THREAD_LOCAL_STORAGE_H /* * Defines abstraction over platform-specific thread-local storage. libhdfs * currently only needs thread-local storage for a single piece of data: the * thread's JNIEnv. For simplicity, this interface is defined in terms of * JNIEnv, not general-purpose thread-local storage of any arbitrary data. */ #include <jni.h> /* * Most operating systems support the more efficient __thread construct, which * is initialized by the linker. The following macros use this technique on the * operating systems that support it. */ #ifdef HAVE_BETTER_TLS #define THREAD_LOCAL_STORAGE_GET_QUICK() \ static __thread JNIEnv *quickTlsEnv = NULL; \ { \ if (quickTlsEnv) { \ return quickTlsEnv; \ } \ } #define THREAD_LOCAL_STORAGE_SET_QUICK(env) \ { \ quickTlsEnv = (env); \ } #else #define THREAD_LOCAL_STORAGE_GET_QUICK() #define THREAD_LOCAL_STORAGE_SET_QUICK(env) #endif /** * Gets the JNIEnv in thread-local storage for the current thread. If the call * succeeds, and there is a JNIEnv associated with this thread, then returns 0 * and populates env. If the call succeeds, but there is no JNIEnv associated * with this thread, then returns 0 and sets JNIEnv to NULL. If the call fails, * then returns non-zero. Only one thread at a time may execute this function. * The caller is responsible for enforcing mutual exclusion. * * @param env JNIEnv out parameter * @return 0 if successful, non-zero otherwise */ int threadLocalStorageGet(JNIEnv **env); /** * Sets the JNIEnv in thread-local storage for the current thread.
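 *
 * (Editorial aside, not in the original source: getJNIEnv in
 * jni_helper.c pairs the two calls roughly as follows --
 *
 *   mutexLock(&jvmMutex);
 *   if (threadLocalStorageGet(&env)) { ... }  // fatal TLS error
 *   if (!env) env = getGlobalJNIEnv();        // first use on this thread
 *   mutexUnlock(&jvmMutex);
 *   threadLocalStorageSet(env);               // cache for later lookups
 *
 * -- which satisfies the mutual-exclusion requirement stated above.)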
* * @param env JNIEnv to set * @return 0 if successful, non-zero otherwise */ int threadLocalStorageSet(JNIEnv *env); #endif ================================================ FILE: src/libhdfs/os/windows/inttypes.h ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef LIBHDFS_INTTYPES_H #define LIBHDFS_INTTYPES_H /* On Windows, inttypes.h does not exist, so manually define what we need. */ #define PRId64 "I64d" #define PRIu64 "I64u" typedef unsigned __int64 uint64_t; #endif ================================================ FILE: src/libhdfs/os/windows/mutexes.c ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "os/mutexes.h" #include <windows.h> mutex hdfsHashMutex; mutex jvmMutex; /** * Unfortunately, there is no simple static initializer for a critical section. * Instead, the API requires calling InitializeCriticalSection. Since libhdfs * lacks an explicit initialization function, there is no obvious existing place * for the InitializeCriticalSection calls. To work around this, we define an * initialization function and instruct the linker to set a pointer to that * function as a user-defined global initializer. See discussion of CRT * Initialization: * http://msdn.microsoft.com/en-us/library/bb918180.aspx */ static void __cdecl initializeMutexes(void) { InitializeCriticalSection(&hdfsHashMutex); InitializeCriticalSection(&jvmMutex); } #pragma section(".CRT$XCU", read) __declspec(allocate(".CRT$XCU")) const void (__cdecl *pInitialize)(void) = initializeMutexes; int mutexLock(mutex *m) { EnterCriticalSection(m); return 0; } int mutexUnlock(mutex *m) { LeaveCriticalSection(m); return 0; } ================================================ FILE: src/libhdfs/os/windows/platform.h ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership.
The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef LIBHDFS_PLATFORM_H #define LIBHDFS_PLATFORM_H #include <fcntl.h> #include <stdio.h> #include <windows.h> /* * O_ACCMODE defined to match Linux definition. */ #ifndef O_ACCMODE #define O_ACCMODE 0x0003 #endif /* * Windows has a different name for its maximum path length constant. */ #ifndef PATH_MAX #define PATH_MAX MAX_PATH #endif /* * Windows does not define EDQUOT and ESTALE in errno.h. The closest equivalents * are these constants from winsock.h. */ #ifndef EDQUOT #define EDQUOT WSAEDQUOT #endif #ifndef ESTALE #define ESTALE WSAESTALE #endif /* * gcc-style type-checked format arguments are not supported on Windows, so just * stub this macro. */ #define TYPE_CHECKED_PRINTF_FORMAT(formatArg, varArgs) /* * Define macros for various string formatting functions not defined on Windows. * Where possible, we reroute to one of the secure CRT variants. On Windows, * the preprocessor does support variadic macros, even though they weren't * defined until C99. */ #define snprintf(str, size, format, ...) \ _snprintf_s((str), (size), _TRUNCATE, (format), __VA_ARGS__) #define strncpy(dest, src, n) \ strncpy_s((dest), (n), (src), _TRUNCATE) #define strtok_r(str, delim, saveptr) \ strtok_s((str), (delim), (saveptr)) #define vsnprintf(str, size, format, ...) \ vsnprintf_s((str), (size), _TRUNCATE, (format), __VA_ARGS__) /* * Mutex data type defined as Windows CRITICAL_SECTION. A critical section (not * Windows mutex) is used, because libhdfs only needs synchronization of multiple * threads within a single process, not synchronization across process * boundaries. */ typedef CRITICAL_SECTION mutex; /* * Thread data type defined as HANDLE to a Windows thread. */ typedef HANDLE threadId; #endif ================================================ FILE: src/libhdfs/os/windows/thread.c ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "os/thread.h" #include <stdio.h> #include <windows.h> /** * Defines a helper function that adapts function pointer provided by caller to * the type required by CreateThread.
* * @param toRun thread to run * @return DWORD result of running thread (always 0) */ static DWORD WINAPI runThread(LPVOID toRun) { const thread *t = toRun; t->start(t->arg); return 0; } int threadCreate(thread *t) { DWORD ret = 0; HANDLE h; h = CreateThread(NULL, 0, runThread, t, 0, NULL); if (h) { t->id = h; } else { ret = GetLastError(); fprintf(stderr, "threadCreate: CreateThread failed with error %d\n", ret); } return ret; } int threadJoin(const thread *t) { DWORD ret = WaitForSingleObject(t->id, INFINITE); switch (ret) { case WAIT_OBJECT_0: break; case WAIT_FAILED: ret = GetLastError(); fprintf(stderr, "threadJoin: WaitForSingleObject failed with error %d\n", ret); break; default: fprintf(stderr, "threadJoin: WaitForSingleObject unexpected error %d\n", ret); break; } return ret; } ================================================ FILE: src/libhdfs/os/windows/thread_local_storage.c ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "os/thread_local_storage.h" #include <jni.h> #include <stdio.h> #include <windows.h> /** Key that allows us to retrieve thread-local storage */ static DWORD gTlsIndex = TLS_OUT_OF_INDEXES; /** * If the current thread has a JNIEnv in thread-local storage, then detaches the * current thread from the JVM. */ static void detachCurrentThreadFromJvm() { JNIEnv *env = NULL; JavaVM *vm; jint ret; if (threadLocalStorageGet(&env) || !env) { return; } ret = (*env)->GetJavaVM(env, &vm); if (ret) { fprintf(stderr, "detachCurrentThreadFromJvm: GetJavaVM failed with error %d\n", ret); (*env)->ExceptionDescribe(env); } else { (*vm)->DetachCurrentThread(vm); } } /** * Unlike pthreads, the Windows API does not seem to provide a convenient way to * hook a callback onto thread shutdown. However, the Windows portable * executable format does define a concept of thread-local storage callbacks. * Here, we define a function and instruct the linker to set a pointer to that * function in the segment for thread-local storage callbacks. See page 85 of * Microsoft Portable Executable and Common Object File Format Specification: * http://msdn.microsoft.com/en-us/gg463119.aspx * This technique only works for implicit linking (OS loads DLL on demand), not * for explicit linking (user code calls LoadLibrary directly). This effectively * means that we have a known limitation: libhdfs may not work correctly if a * Windows application attempts to use it via explicit linking.
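* For example, a host process that loads the library on its own with a direct
* LoadLibrary call (explicit linking) never gets this callback registered, so
* threads it attaches to the JVM through libhdfs are not detached when they
* exit.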
* * @param h module handle * @param reason the reason for calling the callback * @param pv reserved, unused */ static void NTAPI tlsCallback(PVOID h, DWORD reason, PVOID pv) { DWORD tlsIndex; switch (reason) { case DLL_THREAD_DETACH: detachCurrentThreadFromJvm(); break; case DLL_PROCESS_DETACH: detachCurrentThreadFromJvm(); tlsIndex = gTlsIndex; gTlsIndex = TLS_OUT_OF_INDEXES; if (!TlsFree(tlsIndex)) { fprintf(stderr, "tlsCallback: TlsFree failed with error %d\n", GetLastError()); } break; default: break; } } /* * A variable named _tls_used contains the TLS directory, which contains a list * of pointers to callback functions. Normally, the linker won't retain this * variable unless the executable has implicit thread-local variables, defined * using the __declspec(thread) extended storage-class modifier. libhdfs * doesn't use __declspec(thread), and we have no guarantee that the executable * linked to libhdfs will use __declspec(thread). By forcing the linker to * reference _tls_used, we guarantee that the binary retains the TLS directory. * See Microsoft Visual Studio 10.0/VC/crt/src/tlssup.c . */ #ifdef _WIN64 #pragma comment(linker, "/INCLUDE:_tls_used") #else #pragma comment(linker, "/INCLUDE:__tls_used") #endif /* * We must retain a pointer to the callback function. Force the linker to keep * this symbol, even though it appears that nothing in our source code uses it. */ #ifdef _WIN64 #pragma comment(linker, "/INCLUDE:pTlsCallback") #else #pragma comment(linker, "/INCLUDE:_pTlsCallback") #endif /* * Define constant pointer to our callback, and tell the linker to pin it into * the TLS directory so that it receives thread callbacks. Use external linkage * to protect against the linker discarding the seemingly unused symbol. */ #pragma const_seg(".CRT$XLB") extern const PIMAGE_TLS_CALLBACK pTlsCallback; const PIMAGE_TLS_CALLBACK pTlsCallback = tlsCallback; #pragma const_seg() int threadLocalStorageGet(JNIEnv **env) { LPVOID tls; DWORD ret; if (TLS_OUT_OF_INDEXES == gTlsIndex) { gTlsIndex = TlsAlloc(); if (TLS_OUT_OF_INDEXES == gTlsIndex) { fprintf(stderr, "threadLocalStorageGet: TlsAlloc failed with error %d\n", TLS_OUT_OF_INDEXES); return TLS_OUT_OF_INDEXES; } } tls = TlsGetValue(gTlsIndex); if (tls) { *env = tls; return 0; } else { ret = GetLastError(); if (ERROR_SUCCESS == ret) { /* Thread-local storage contains NULL, because we haven't set it yet. */ *env = NULL; return 0; } else { /* * The API call failed. According to documentation, TlsGetValue cannot * fail as long as the index is a valid index from a successful TlsAlloc * call. This error handling is purely defensive. */ fprintf(stderr, "threadLocalStorageGet: TlsGetValue failed with error %d\n", ret); return ret; } } } int threadLocalStorageSet(JNIEnv *env) { DWORD ret = 0; if (!TlsSetValue(gTlsIndex, (LPVOID)env)) { ret = GetLastError(); fprintf(stderr, "threadLocalStorageSet: TlsSetValue failed with error %d\n", ret); detachCurrentThreadFromJvm(env); } return ret; } ================================================ FILE: src/libhdfs/os/windows/unistd.h ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef LIBHDFS_UNISTD_H #define LIBHDFS_UNISTD_H /* On Windows, unistd.h does not exist, so manually define what we need. */ #include <process.h> /* Declares getpid(). */ #include <windows.h> /* Re-route sleep to Sleep, converting units from seconds to milliseconds. */ #define sleep(seconds) Sleep((seconds) * 1000) #endif ================================================ FILE: src/native_core_hdfs/hdfs_file.cc ================================================ /* BEGIN_COPYRIGHT * * Copyright 2009-2026 CRS4. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy * of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations * under the License. * * END_COPYRIGHT */ #include "hdfs_file.h" #include <errno.h> #define PYDOOP_TEXT_ENCODING "utf-8" PyObject* FileClass_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { FileInfo *self = NULL; self = (FileInfo *)type->tp_alloc(type, 0); if (self != NULL) { self->fs = NULL; self->file = NULL; if (NULL == (self->name = PyUnicode_FromString(""))) { Py_DECREF(self); return NULL; } if (NULL == (self->mode = PyUnicode_FromString(""))) { Py_DECREF(self); return NULL; } self->size = 0; self->buff_size = 0; self->replication = 1; self->blocksize = 0; self->closed = 0; } return (PyObject *)self; } void FileClass_dealloc(FileInfo* self) { self->file = NULL; Py_TYPE(self)->tp_free((PyObject*)self); } int FileClass_init(FileInfo *self, PyObject *args, PyObject *kwds) { PyObject *name = NULL, *mode = NULL, *tmp = NULL; if (!PyArg_ParseTuple(args, "OOOO", &(self->fs), &(self->file), &name, &mode)) { return -1; } if (name) { tmp = self->name; Py_INCREF(name); self->name = name; Py_XDECREF(tmp); } if (mode) { tmp = self->mode; Py_INCREF(mode); self->mode = mode; Py_XDECREF(tmp); } return 0; } int FileClass_init_internal(FileInfo *self, hdfsFS fs, hdfsFile file) { self->fs = fs; self->file = file; return 0; } PyObject* FileClass_close(FileInfo* self){ int result = hdfsCloseFile(self->fs, self->file); if (result < 0) { return PyErr_SetFromErrno(PyExc_IOError); } else { self->closed = 1; return PyBool_FromLong(1); } } PyObject* FileClass_getclosed(FileInfo* self, void* closure) { return PyBool_FromLong(self->closed); } PyObject* FileClass_getbuff_size(FileInfo* self, void* closure) { return PyLong_FromLong(self->buff_size); } PyObject* FileClass_getname(FileInfo* self, void* closure) { Py_INCREF(self->name); return self->name; } PyObject* FileClass_getmode(FileInfo* self, void* closure) { Py_INCREF(self->mode); return self->mode; } PyObject* FileClass_readable(FileInfo* self) { return PyBool_FromLong(hdfsFileIsOpenForRead(self->file)); } PyObject* FileClass_writable(FileInfo* self) { return PyBool_FromLong(hdfsFileIsOpenForWrite(self->file)); } PyObject* FileClass_seekable(FileInfo* self) { return
PyBool_FromLong(hdfsFileIsOpenForRead(self->file)); } PyObject* FileClass_available(FileInfo *self){ int available = hdfsAvailable(self->fs, self->file); if (available < 0) return PyErr_SetFromErrno(PyExc_IOError); else return PyLong_FromLong(available); } static int _ensure_open_for_reading(FileInfo* self) { if (!hdfsFileIsOpenForRead(self->file)) { PyErr_SetString(PyExc_IOError, "File is not opened in READ ('r') mode"); return 0; // False } return 1; // True } static Py_ssize_t _read_into_pybuf(FileInfo *self, char* buf, Py_ssize_t nbytes) { if (nbytes < 0) { PyErr_SetString(PyExc_ValueError, "nbytes must be >= 0"); return -1; } tSize bytes_read; Py_BEGIN_ALLOW_THREADS; bytes_read = hdfsRead(self->fs, self->file, buf, nbytes); Py_END_ALLOW_THREADS; if (bytes_read < 0) { // error PyErr_SetFromErrno(PyExc_IOError); return -1; } return bytes_read; } static PyObject* _read_new_pybuf(FileInfo* self, Py_ssize_t nbytes) { if (nbytes < 0) { PyErr_SetString(PyExc_ValueError, "nbytes must be >= 0"); return NULL; } // Allocate an uninitialized buffer object. // We then access and directly modify the buffer's internal memory. This is // ok until we release this string "into the wild". PyObject* retval = _PyBuf_FromStringAndSize(NULL, nbytes); if (!retval) return PyErr_NoMemory(); Py_ssize_t bytes_read = _read_into_pybuf(self, _PyBuf_AS_STRING(retval), nbytes); if (bytes_read >= 0) { // If bytes_read >= 0, read worked properly. But, if bytes_read < nbytes // we got fewer bytes than requested (maybe we reached EOF?). We need // to shrink the string to the correct length. In case of error the // call to _PyString_Resize frees the original string, sets the // appropriate python exception and returns -1. if (bytes_read >= nbytes || _PyBuf_Resize(&retval, bytes_read) >= 0) return retval; // all good } // If we get here something's gone wrong. The exception should already be set. Py_DECREF(retval); return NULL; } /* * Seek to `pos` and read `nbytes` bytes into the provided buffer. * * \return: Number of bytes read. In case of error this function sets * the appropriate Python exception and returns -1. */ static Py_ssize_t _pread_into_pybuf(FileInfo *self, char* buffer, Py_ssize_t pos, Py_ssize_t nbytes) { Py_ssize_t orig_position = hdfsTell(self->fs, self->file); if (orig_position < 0) { PyErr_SetFromErrno(PyExc_IOError); return -1; } if (hdfsSeek(self->fs, self->file, pos) < 0) { PyErr_SetFromErrno(PyExc_IOError); return -1; } tSize bytes_read = _read_into_pybuf(self, buffer, nbytes); if (bytes_read < 0) { PyErr_SetFromErrno(PyExc_IOError); return -1; } if (hdfsSeek(self->fs, self->file, orig_position) < 0) { PyErr_SetFromErrno(PyExc_IOError); return -1; } return bytes_read; } static PyObject* _pread_new_pybuf(FileInfo* self, Py_ssize_t pos, Py_ssize_t nbytes) { if (nbytes < 0) { PyErr_SetString(PyExc_ValueError, "nbytes must be >= 0"); return NULL; } // Allocate an uninitialized string object. PyObject* retval = _PyBuf_FromStringAndSize(NULL, nbytes); if (!retval) return PyErr_NoMemory(); Py_ssize_t bytes_read = _pread_into_pybuf(self, _PyBuf_AS_STRING(retval), pos, nbytes); if (bytes_read >= 0) { // If bytes_read >= 0, read worked properly. But, if bytes_read < nbytes // we got fewer bytes than requested (maybe we reached EOF?). We need // to shrink the string to the correct length. In case of error the // call to _PyString_Resize frees the original string, sets the // appropriate python exception and returns -1.
if (bytes_read >= nbytes || _PyBuf_Resize(&retval, bytes_read) >= 0) return retval; // all good } // If we get here something's gone wrong. The exception should already be set. Py_DECREF(retval); return NULL; } PyObject* FileClass_read(FileInfo *self, PyObject *args, PyObject *kwds){ Py_ssize_t nbytes = 0; if (!_ensure_open_for_reading(self)) return NULL; if (! PyArg_ParseTuple(args, "n", &(nbytes))) return NULL; if (nbytes < 0) { PyErr_SetString(PyExc_ValueError, "nbytes must be >= 0"); return NULL; } else if (nbytes == 0) { return _PyBuf_FromString(""); } // else nbytes > 0 return _read_new_pybuf(self, nbytes); } PyObject* FileClass_read_chunk(FileInfo *self, PyObject *args, PyObject *kwds){ Py_buffer buffer = {NULL, NULL}; if (!_ensure_open_for_reading(self)) return NULL; if (! PyArg_ParseTuple(args, "w*", &buffer)) return NULL; Py_ssize_t bytes_read = _read_into_pybuf(self, (char*)buffer.buf, buffer.len); PyBuffer_Release(&buffer); if (bytes_read >= 0) return Py_BuildValue("n", bytes_read); else return NULL; } PyObject* FileClass_pread(FileInfo *self, PyObject *args, PyObject *kwds){ Py_ssize_t position = 0; Py_ssize_t nbytes = 0; if (!_ensure_open_for_reading(self)) return NULL; if (! PyArg_ParseTuple(args, "nn", &position, &nbytes)) return NULL; if (position < 0) { errno = EINVAL; PyErr_SetFromErrno(PyExc_IOError); errno = 0; return NULL; } if (nbytes == 0) return _PyBuf_FromString(""); // else return _pread_new_pybuf(self, position, nbytes); } PyObject* FileClass_pread_chunk(FileInfo *self, PyObject *args, PyObject *kwds){ Py_buffer buffer = {NULL, NULL}; Py_ssize_t position = 0; if (!_ensure_open_for_reading(self)) return NULL; if (! PyArg_ParseTuple(args, "nw*", &position, &buffer)) return NULL; if (position < 0) { errno = EINVAL; PyErr_SetFromErrno(PyExc_IOError); errno = 0; return NULL; } Py_ssize_t bytes_read = _pread_into_pybuf(self, (char*)buffer.buf, position, buffer.len); PyBuffer_Release(&buffer); if (bytes_read >= 0) return Py_BuildValue("n", bytes_read); else return NULL; } PyObject* FileClass_seek(FileInfo *self, PyObject *args, PyObject *kwds) { tOffset position = 0, curpos = 0; int whence = SEEK_SET; if (!PyArg_ParseTuple(args, "n|i", &position, &whence)) return NULL; switch (whence) { case SEEK_SET: break; case SEEK_CUR: curpos = hdfsTell(self->fs, self->file); if (curpos < 0) { return PyErr_SetFromErrno(PyExc_IOError); } position += curpos; break; case SEEK_END: position += self->size; break; default: PyErr_SetString(PyExc_ValueError, "unsupported whence value"); return NULL; } /* HDFS does not support seeking past end of file */ if (position < 0 || position > self->size) { errno = EINVAL; PyErr_SetFromErrno(PyExc_IOError); errno = 0; return NULL; } if (hdfsSeek(self->fs, self->file, position) < 0) { return PyErr_SetFromErrno(PyExc_IOError); } return PyLong_FromLong(position); } PyObject* FileClass_tell(FileInfo *self, PyObject *args, PyObject *kwds){ tOffset offset = hdfsTell(self->fs, self->file); if (offset >= 0) return Py_BuildValue("n", offset); else { PyErr_SetFromErrno(PyExc_IOError); return NULL; } } PyObject* FileClass_write(FileInfo* self, PyObject *args, PyObject *kwds) { PyObject *input = NULL; Py_buffer buffer = {NULL, NULL}; if (!hdfsFileIsOpenForWrite(self->file)) { PyErr_SetString(PyExc_IOError, "not writable"); return NULL; } if (!PyArg_ParseTuple(args, "O", &input)) { return NULL; } if (PyObject_GetBuffer(input, &buffer, PyBUF_SIMPLE) < 0) { PyErr_SetString(PyExc_TypeError, "Argument not accessible as a buffer"); return NULL; } Py_ssize_t written; 
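// hdfsWrite may block on network I/O, so the GIL is released around the call
// below. The Py_buffer acquired above keeps the underlying memory alive until
// PyBuffer_Release, so buffer.buf remains valid while the GIL is not held.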
Py_BEGIN_ALLOW_THREADS; written = hdfsWrite(self->fs, self->file, buffer.buf, buffer.len); Py_END_ALLOW_THREADS; PyBuffer_Release(&buffer); if (written < 0) { PyErr_SetFromErrno(PyExc_IOError); return NULL; } return Py_BuildValue("n", written); } PyObject* FileClass_flush(FileInfo *self){ if (!hdfsFileIsOpenForWrite(self->file)) { Py_RETURN_NONE; } int result = hdfsFlush(self->fs, self->file); if (result >= 0) { Py_RETURN_NONE; } else { PyErr_SetFromErrno(PyExc_IOError); return NULL; } } ================================================ FILE: src/native_core_hdfs/hdfs_file.h ================================================ /* BEGIN_COPYRIGHT * * Copyright 2009-2026 CRS4. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy * of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations * under the License. * * END_COPYRIGHT */ #ifndef PYTHON_HDFS_FILE_TYPE #define PYTHON_HDFS_FILE_TYPE #include <Python.h> #include <string> #include <iostream> #include <utility> // std::pair support #include <errno.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <hdfs.h> #include "../py3k_compat.h" typedef struct { PyObject_HEAD hdfsFS fs; hdfsFile file; PyObject *name; PyObject *mode; tOffset size; int buff_size; short replication; int blocksize; int closed; } FileInfo; PyObject* FileClass_new(PyTypeObject *type, PyObject *args, PyObject *kwds); void FileClass_dealloc(FileInfo* self); int FileClass_init(FileInfo *self, PyObject *args, PyObject *kwds); int FileClass_init_internal(FileInfo *self, hdfsFS fs, hdfsFile file); PyObject* FileClass_close(FileInfo* self); PyObject* FileClass_getclosed(FileInfo* self, void* closure); PyObject* FileClass_getbuff_size(FileInfo* self, void* closure); PyObject* FileClass_getname(FileInfo* self, void* closure); PyObject* FileClass_getmode(FileInfo* self, void* closure); PyObject* FileClass_readable(FileInfo* self); PyObject* FileClass_writable(FileInfo* self); PyObject* FileClass_seekable(FileInfo* self); PyObject* FileClass_mode(FileInfo* self); PyObject* FileClass_write(FileInfo* self, PyObject *args, PyObject *kwds); PyObject* FileClass_get_mode(FileInfo *self); PyObject* FileClass_available(FileInfo *self); PyObject* FileClass_read(FileInfo *self, PyObject *args, PyObject *kwds); PyObject* FileClass_read_chunk(FileInfo *self, PyObject *args, PyObject *kwds); PyObject* FileClass_pread(FileInfo *self, PyObject *args, PyObject *kwds); PyObject* FileClass_pread_chunk(FileInfo *self, PyObject *args, PyObject *kwds); PyObject* FileClass_seek(FileInfo *self, PyObject *args, PyObject *kwds); PyObject* FileClass_tell(FileInfo *self, PyObject *args, PyObject *kwds); PyObject* FileClass_flush(FileInfo *self); #endif ================================================ FILE: src/native_core_hdfs/hdfs_fs.cc ================================================ /* BEGIN_COPYRIGHT * * Copyright 2009-2026 CRS4. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License.
You may obtain a copy * of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations * under the License. * * END_COPYRIGHT */ #include "hdfs_fs.h" #include "hdfs_file.h" #include <errno.h> #include <fcntl.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #define MAX_WD_BUFFSIZE 2048 #define str_empty(s) ((s) == NULL || (*(s) == '\0')) PyObject* FsClass_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { FsInfo *self = NULL; self = (FsInfo *)type->tp_alloc(type, 0); if (self != NULL) { self->host = NULL; self->port = 0; self->user = NULL; self->group = NULL; self->_fs = NULL; } return (PyObject *)self; } void FsClass_dealloc(FsInfo* self) { Py_TYPE(self)->tp_free((PyObject*)self); } int FsClass_init(FsInfo *self, PyObject *args, PyObject *kwds) { // XXX: This call to PyArg_ParseTuple doesn't support non-ASCII characters in // the input strings (host, user, group) if (! PyArg_ParseTuple(args, "z|izz", &(self->host), &(self->port), &(self->user), &(self->group))) return -1; if (str_empty(self->host)) self->host = NULL; if (str_empty(self->user)) self->user = NULL; if (str_empty(self->group)) self->group = NULL; // Connect cycles and retries more than once if necessary. Better let // other Python threads through. Py_BEGIN_ALLOW_THREADS; if (self->user != NULL) { self->_fs = hdfsConnectAsUser(self->host, self->port, self->user); } else { self->_fs = hdfsConnect(self->host, self->port); } Py_END_ALLOW_THREADS; if (!self->_fs) { PyErr_SetFromErrno(PyExc_RuntimeError); return -1; } return 0; } PyObject* FsClass_close(FsInfo* self) { hdfsDisconnect(self->_fs); Py_RETURN_NONE; } PyObject* FsClass_get_working_directory(FsInfo* self) { const size_t bufferSize = MAX_WD_BUFFSIZE; char *buffer = (char*)PyMem_Malloc(bufferSize); if (!buffer) return PyErr_NoMemory(); if (hdfsGetWorkingDirectory(self->_fs, buffer, bufferSize) == NULL) { PyErr_SetString(PyExc_RuntimeError, "Cannot get working directory."); PyMem_Free(buffer); return NULL; } PyObject* result = PyUnicode_FromString(buffer); PyMem_Free(buffer); if (!result) return PyErr_NoMemory(); return result; } PyObject* FsClass_get_path_info(FsInfo* self, PyObject *args, PyObject *kwds) { char* path = NULL; PyObject* retval = NULL; hdfsFileInfo* info = NULL; if (!PyArg_ParseTuple(args, "es", "utf-8", &path)) { return NULL; } if (str_empty(path)) { PyMem_Free(path); PyErr_SetString(PyExc_ValueError, "Empty path"); return NULL; } Py_BEGIN_ALLOW_THREADS; info = hdfsGetPathInfo(self->_fs, path); Py_END_ALLOW_THREADS; if (info == NULL) { PyMem_Free(path); return PyErr_SetFromErrno(PyExc_IOError); } retval = Py_BuildValue("{s:O,s:s,s:s,s:i,s:i,s:h,s:s,s:h,s:i,s:O,s:L}", "name", PyUnicode_FromString(info->mName), "kind", info->mKind == kObjectKindDirectory ?
"directory" : "file", "group", info->mGroup, "last_mod", info->mLastMod, "last_access", info->mLastAccess, "replication", info->mReplication, "owner", info->mOwner, "permissions", info->mPermissions, "block_size", info->mBlockSize, "path", PyUnicode_FromString(info->mName), "size", info->mSize ); PyMem_Free(path); hdfsFreeFileInfo(info, 1); return retval; } PyObject* FsClass_get_hosts(FsInfo* self, PyObject *args, PyObject *kwds) { Py_ssize_t start = 0, length = 0; PyObject* result = NULL; char* path = NULL; char*** hosts = NULL; if (!PyArg_ParseTuple(args, "esnn", "utf-8", &path, &start, &length)) { return NULL; } if (str_empty(path)) { PyMem_Free(path); PyErr_SetString(PyExc_ValueError, "Empty path"); return NULL; } if (start < 0 || length < 0) { PyMem_Free(path); PyErr_SetString(PyExc_ValueError, "Start position and length must be >= 0"); return NULL; } Py_BEGIN_ALLOW_THREADS; hosts = hdfsGetHosts(self->_fs, path, start, length); Py_END_ALLOW_THREADS; PyMem_Free(path); if (!hosts) { return PyErr_SetFromErrno(PyExc_RuntimeError); } result = PyList_New(0); if (!result) goto mem_error; for (int blockNumber = 0; hosts[blockNumber] != NULL; ++blockNumber) { PyObject* blockHosts = PyList_New(0); if (!blockHosts) goto mem_error; for (int iBlockHost = 0; hosts[blockNumber][iBlockHost] != NULL; ++iBlockHost) { PyObject* str = PyUnicode_FromString(hosts[blockNumber][iBlockHost]); if (!str) goto mem_error; if (PyList_Append(blockHosts, str) < 0) goto mem_error; } if (PyList_Append(result, blockHosts) < 0) goto mem_error; } goto done; // skip the mem_error section mem_error: PyErr_SetString(PyExc_MemoryError, "Error allocating host structure"); Py_XDECREF(result); result = NULL; // fall through done: if (hosts) hdfsFreeHosts(hosts); return result; } PyObject* FsClass_get_default_block_size(FsInfo* self) { tOffset size = hdfsGetDefaultBlockSize(self->_fs); return PyLong_FromSsize_t(size); } PyObject* FsClass_get_used(FsInfo* self) { tOffset size = hdfsGetUsed(self->_fs); return PyLong_FromSsize_t(size); } PyObject* FsClass_set_replication(FsInfo* self, PyObject* args, PyObject* kwds) { char* path = NULL; short replication = 0; int result = 0; if (!PyArg_ParseTuple(args, "esh", "utf-8", &path, &replication)) return NULL; if (str_empty(path)) { PyMem_Free(path); PyErr_SetString(PyExc_ValueError, "Empty path"); return NULL; } Py_BEGIN_ALLOW_THREADS; result = hdfsSetReplication(self->_fs, path, replication); Py_END_ALLOW_THREADS; PyMem_Free(path); if (result < 0) { return PyErr_SetFromErrno(PyExc_IOError); } Py_RETURN_NONE; } PyObject* FsClass_set_working_directory(FsInfo* self, PyObject* args, PyObject* kwds) { char* path = NULL; int result = 0; if (!PyArg_ParseTuple(args, "es", "utf-8", &path)) return NULL; if (str_empty(path)) { PyMem_Free(path); PyErr_SetString(PyExc_ValueError, "Empty path"); return NULL; } Py_BEGIN_ALLOW_THREADS; result = hdfsSetWorkingDirectory(self->_fs, path); Py_END_ALLOW_THREADS; PyMem_Free(path); if (result < 0) { return PyErr_SetFromErrno(PyExc_IOError); } Py_RETURN_NONE; } PyObject* FsClass_open_file(FsInfo* self, PyObject *args, PyObject *kwds) { PyObject* retval = NULL; char* path = NULL; const char* mode = MODE_READ; int flags = 0; int buff_size = 0; int blocksize = 0; short replication = 0; hdfsFile file = NULL; tOffset size = 0; hdfsFileInfo* info = NULL; if (!PyArg_ParseTuple(args, "es|sihi", "utf-8", &path, &mode, &buff_size, &replication, &blocksize)) { return NULL; } if (str_empty(path)) { PyMem_Free(path); PyErr_SetString(PyExc_ValueError, "Empty path"); return 
NULL; } if (strcmp(mode, MODE_READ) == 0) { flags = O_RDONLY; } else if (strcmp(mode, MODE_WRITE) == 0) { flags = O_WRONLY; } else if (strcmp(mode, MODE_APPEND) == 0) { flags = O_WRONLY | O_APPEND; } else { PyMem_Free(path); PyErr_SetString(PyExc_ValueError, "Invalid mode"); return NULL; } Py_BEGIN_ALLOW_THREADS; file = hdfsOpenFile(self->_fs, path, flags, buff_size, replication, blocksize); Py_END_ALLOW_THREADS; if (file == NULL) { PyMem_Free(path); return PyErr_SetFromErrno(PyExc_IOError); } PyObject* module = PyImport_ImportModule("pydoop.native_core_hdfs"); if (NULL == module) { PyMem_Free(path); free(file); return NULL; } PyObject *name = PyUnicode_FromString(path); PyObject *pymode = PyUnicode_FromString(mode); retval = PyObject_CallMethod(module, "CoreHdfsFile", "OOOO", self->_fs, file, name, pymode); Py_XDECREF(pymode); Py_XDECREF(name); Py_XDECREF(module); if (NULL == retval) { PyMem_Free(path); free(file); return NULL; } /* get file size for the SEEK_END variant of seek */ if (flags == O_RDONLY) { Py_BEGIN_ALLOW_THREADS; info = hdfsGetPathInfo(self->_fs, path); Py_END_ALLOW_THREADS; if (info == NULL) { PyMem_Free(path); return PyErr_SetFromErrno(PyExc_IOError); } size = info->mSize; hdfsFreeFileInfo(info, 1); } PyMem_Free(path); FileInfo *fileInfo = ((FileInfo*) retval); fileInfo->size = size; fileInfo->buff_size = buff_size; fileInfo->blocksize = blocksize; fileInfo->replication = replication; return retval; } PyObject *FsClass_get_capacity(FsInfo *self) { tOffset capacity; Py_BEGIN_ALLOW_THREADS; errno = 0; // hdfsGetCapacity forgets to clear errno capacity = hdfsGetCapacity(self->_fs); Py_END_ALLOW_THREADS; if (capacity < 0) { // two error cases are contemplated by the code in hdfsGetCapacity: // 1) exception from the Java method // 2) FS instance is not a DistributedFileSystem. // Here we copy their error textually. if (errno) PyErr_SetFromErrno(PyExc_IOError); else { PyErr_SetString(PyExc_RuntimeError, "hdfsGetCapacity works only on a DistributedFileSystem"); } return NULL; } return PyLong_FromSsize_t(capacity); } PyObject* FsClass_copy(FsInfo* self, PyObject *args, PyObject *kwds) { FsInfo* to_hdfs = NULL; char *from_path = NULL, *to_path = NULL; int result = 0; if (! PyArg_ParseTuple(args, "esOes", "utf-8", &from_path, &to_hdfs, "utf-8", &to_path)) { return NULL; } if (str_empty(from_path) || str_empty(to_path)) { PyMem_Free(from_path); PyMem_Free(to_path); PyErr_SetString(PyExc_ValueError, "Empty path"); return NULL; } Py_BEGIN_ALLOW_THREADS; result = hdfsCopy(self->_fs, from_path, to_hdfs->_fs, to_path); Py_END_ALLOW_THREADS; PyMem_Free(from_path); PyMem_Free(to_path); if (result < 0) { return PyErr_SetFromErrno(PyExc_IOError); } return PyLong_FromLong(result); } PyObject *FsClass_exists(FsInfo *self, PyObject *args, PyObject *kwds) { char* path = NULL; int result = 0; if (! PyArg_ParseTuple(args, "es", "utf-8", &path)) return NULL; if (str_empty(path)) { PyMem_Free(path); PyErr_SetString(PyExc_ValueError, "Empty path"); return NULL; } Py_BEGIN_ALLOW_THREADS; result = hdfsExists(self->_fs, path); Py_END_ALLOW_THREADS; PyMem_Free(path); // LP: hdfsExists (in some cases?) sets errno to ENOENT "[Errno 2] No such // file or directory" when the path doesn't exist or EEXIST in other cases. // I don't know why. Since that's what we're trying to test, I'll skip // checking errno here. The consequence is that when we return false it // may be because of an error and not because the path doesn't exist. 
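// Note that hdfsExists returns 0 when the path exists and a negative value
// otherwise, which is why the expression below maps result >= 0 to True.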
// // if (result < 0 && errno) return PyErr_SetFromErrno(PyExc_IOError); return PyBool_FromLong(result >= 0 ? 1 : 0); } PyObject *FsClass_create_directory(FsInfo *self, PyObject *args, PyObject *kwds) { char* path = NULL; int result = 0; if (! PyArg_ParseTuple(args, "es", "utf-8", &path)) { return NULL; } if (str_empty(path)) { PyMem_Free(path); PyErr_SetString(PyExc_ValueError, "Empty path"); return NULL; } Py_BEGIN_ALLOW_THREADS; result = hdfsCreateDirectory(self->_fs, path); Py_END_ALLOW_THREADS; PyMem_Free(path); if (result < 0) { return PyErr_SetFromErrno(PyExc_IOError); } Py_RETURN_NONE; } /* * Works on borrowed reference `dict`. * * \return 0 if successful * \return -1 if there was a problem. In that case, dict may contain * some values, but will be incomplete and should be discarded. */ static int setPathInfo(PyObject* dict, hdfsFileInfo* fileInfo) { if (dict == NULL || fileInfo == NULL) return -1; int error_code = 0; const char*const keys[] = { "name", "kind", "group", "last_mod", "last_access", "replication", "owner", "permissions", "block_size", "path", "size" }; const int n_fields = sizeof(keys) / sizeof(keys[0]); PyObject* values[n_fields]; int i = 0; // Prepare the values. We'll check for all errors in the "set" loop below // The order of these values MUST match the order of the keys above values[i++] = PyUnicode_FromString(fileInfo->mName); values[i++] = PyUnicode_FromString(fileInfo->mKind == kObjectKindDirectory ? "directory" : "file"); values[i++] = PyUnicode_FromString(fileInfo->mGroup); values[i++] = PyLong_FromLong(fileInfo->mLastMod); values[i++] = PyLong_FromLong(fileInfo->mLastAccess); values[i++] = PyLong_FromSize_t(fileInfo->mReplication); values[i++] = PyUnicode_FromString(fileInfo->mOwner); values[i++] = PyLong_FromSize_t(fileInfo->mPermissions); values[i++] = PyLong_FromLong(fileInfo->mBlockSize); values[i++] = PyUnicode_FromString(fileInfo->mName); values[i++] = PyLong_FromLongLong(fileInfo->mSize); for (i = 0; i < n_fields; ++i) { if (values[i] == NULL || PyDict_SetItemString(dict, keys[i], values[i]) < 0) { error_code = -1; break; // Don't DECREF here. The error handling code goes through the entire array // and thus we'd end up DECREFing some objects twice. 
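// (PyDict_SetItemString does not steal references, so each prepared value
// must be released exactly once; the cleanup loop below does that for every
// entry, whether or not it was stored in the dict.)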
} } for (i = 0; i < n_fields; ++i) { Py_XDECREF(values[i]); // some values may be null (if there was an error) } return error_code; } PyObject *FsClass_list_directory(FsInfo *self, PyObject *args, PyObject *kwds) { PyObject* retval = NULL; char* path = NULL; hdfsFileInfo* pathList = NULL; int numEntries = 0; hdfsFileInfo* pathInfo = NULL; if (!PyArg_ParseTuple(args, "es", "utf-8", &path)) return NULL; if (str_empty(path)) { PyMem_Free(path); PyErr_SetString(PyExc_ValueError, "Empty path"); return NULL; } Py_BEGIN_ALLOW_THREADS; pathInfo = hdfsGetPathInfo(self->_fs, path); PyMem_Free(path); if (!pathInfo) { Py_BLOCK_THREADS; // later we 'goto' skipping over END_ALLOW_THREADS PyErr_SetFromErrno(PyExc_IOError); goto error; } if (pathInfo->mKind == kObjectKindDirectory) { pathList = hdfsListDirectory(self->_fs, pathInfo->mName, &numEntries); // hdfsListDirectory returns NULL when a directory is empty, so to determine // whether there's been an error we also need to check errno if (!pathList && errno) { Py_BLOCK_THREADS; // later we 'goto' skipping over END_ALLOW_THREADS PyErr_SetFromErrno(PyExc_IOError); goto error; } } else { numEntries = 1; pathList = pathInfo; pathInfo = NULL; } Py_END_ALLOW_THREADS; retval = PyList_New(numEntries); if (!retval) goto mem_error; for (Py_ssize_t i = 0; i < numEntries; i++) { PyObject* infoDict = PyDict_New(); if (!infoDict) goto mem_error; PyList_SET_ITEM(retval, i, infoDict); if (setPathInfo(infoDict, &pathList[i]) < 0) { PyErr_SetString(PyExc_IOError, "Error getting file info"); goto error; } } goto done; // skip the error section mem_error: PyErr_SetString(PyExc_MemoryError, "Error allocating structures"); // fall through error: // in case of error DECREF our retval structure and return NULL if (retval != NULL) { Py_XDECREF(retval); retval = NULL; } done: // all code paths go through the 'done' section if (pathInfo != NULL) hdfsFreeFileInfo(pathInfo, 1); if (pathList != NULL) hdfsFreeFileInfo(pathList, numEntries); return retval; } PyObject *FsClass_move(FsInfo *self, PyObject *args, PyObject *kwds) { FsInfo* to_hdfs = NULL; char *from_path = NULL, *to_path = NULL; int result = 0; if (! PyArg_ParseTuple(args, "esOes", "utf-8", &from_path, &to_hdfs, "utf-8", &to_path)) { return NULL; } if (str_empty(from_path) || str_empty(to_path)) { PyMem_Free(from_path); PyMem_Free(to_path); PyErr_SetString(PyExc_ValueError, "Empty path"); return NULL; } Py_BEGIN_ALLOW_THREADS; result = hdfsMove(self->_fs, from_path, to_hdfs->_fs, to_path); Py_END_ALLOW_THREADS; PyMem_Free(from_path); PyMem_Free(to_path); if (result < 0) { return PyErr_SetFromErrno(PyExc_IOError); } Py_RETURN_NONE; } PyObject *FsClass_rename(FsInfo *self, PyObject *args, PyObject *kwds) { char *from_path = NULL, *to_path = NULL; int result = 0; if (!
PyArg_ParseTuple(args, "eses", "utf-8", &from_path, "utf-8", &to_path)) return NULL; if (str_empty(from_path) || str_empty(to_path)) { PyMem_Free(from_path); PyMem_Free(to_path); PyErr_SetString(PyExc_ValueError, "Empty path"); return NULL; } Py_BEGIN_ALLOW_THREADS; result = hdfsRename(self->_fs, from_path, to_path); Py_END_ALLOW_THREADS; PyMem_Free(from_path); PyMem_Free(to_path); if (result < 0) { return PyErr_SetFromErrno(PyExc_IOError); } Py_RETURN_NONE; } PyObject *FsClass_delete(FsInfo *self, PyObject *args, PyObject *kwds) { char* path = NULL; int recursive = 1; int result = 0; if (!PyArg_ParseTuple(args, "es|i", "utf-8", &path, &recursive)) { return NULL; } if (str_empty(path)) { PyMem_Free(path); PyErr_SetString(PyExc_ValueError, "Empty path"); return NULL; } Py_BEGIN_ALLOW_THREADS; result = hdfsDelete(self->_fs, path, recursive); Py_END_ALLOW_THREADS; PyMem_Free(path); if (result < 0) { return PyErr_SetFromErrno(PyExc_IOError); } Py_RETURN_NONE; } PyObject *FsClass_chmod(FsInfo *self, PyObject *args, PyObject *kwds) { char* path = NULL; short mode = 1; int result = 0; if (!PyArg_ParseTuple(args, "esh", "utf-8", &path, &mode)) { return NULL; } if (str_empty(path)) { PyMem_Free(path); PyErr_SetString(PyExc_ValueError, "Empty path"); return NULL; } Py_BEGIN_ALLOW_THREADS; // hdfsChmod doesn't always set errno in case of error. We clear it // here so that after the call we'll be sure we're not looking at an old value errno = 0; result = hdfsChmod(self->_fs, path, mode); Py_END_ALLOW_THREADS; PyMem_Free(path); if (result >= 0) { Py_RETURN_NONE; } else { // there's been an error if (errno) { return PyErr_SetFromErrno(PyExc_IOError); } else { PyErr_SetString(PyExc_IOError, "Unknown error"); return NULL; } } } PyObject *FsClass_chown(FsInfo *self, PyObject *args, PyObject *kwds) { char *path = NULL, *input_user = NULL, *input_group = NULL; int result = 0; hdfsFileInfo* fileInfo = NULL; if (! PyArg_ParseTuple(args, "es|eses", "utf-8", &path, "utf-8", &input_user, "utf-8", &input_group)) { return NULL; } if (str_empty(path)) { PyMem_Free(path); PyMem_Free(input_user); PyMem_Free(input_group); PyErr_SetString(PyExc_ValueError, "Empty path"); return NULL; } Py_BEGIN_ALLOW_THREADS; fileInfo = hdfsGetPathInfo(self->_fs, path); if (NULL == fileInfo) { PyMem_Free(path); PyMem_Free(input_user); PyMem_Free(input_group); return PyErr_SetFromErrno(PyExc_IOError); } const char* new_user = str_empty(input_user) ? fileInfo->mOwner : input_user; const char* new_group = str_empty(input_group) ? fileInfo->mGroup : input_group; result = hdfsChown(self->_fs, path, new_user, new_group); Py_END_ALLOW_THREADS; PyMem_Free(path); PyMem_Free(input_user); PyMem_Free(input_group); hdfsFreeFileInfo(fileInfo, 1); if (result < 0) { return PyErr_SetFromErrno(PyExc_IOError); } Py_RETURN_NONE; } PyObject *FsClass_utime(FsInfo *self, PyObject *args, PyObject *kwds) { char* path = NULL; tTime mtime = 0, atime = 0; int result = 0; if (! PyArg_ParseTuple(args, "esll", "utf-8", &path, &mtime, &atime)) { return NULL; } if (str_empty(path)) { PyMem_Free(path); PyErr_SetString(PyExc_ValueError, "Empty path"); return NULL; } Py_BEGIN_ALLOW_THREADS; result = hdfsUtime(self->_fs, path, mtime, atime); Py_END_ALLOW_THREADS; PyMem_Free(path); if (result < 0) { return PyErr_SetFromErrno(PyExc_IOError); } Py_RETURN_NONE; } ================================================ FILE: src/native_core_hdfs/hdfs_fs.h ================================================ /* BEGIN_COPYRIGHT * * Copyright 2009-2026 CRS4. 
* * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy * of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations * under the License. * * END_COPYRIGHT */ #ifndef PYTHON_HDFS_FS_TYPE #define PYTHON_HDFS_FS_TYPE #include <Python.h> #include <string> #include <iostream> #include <utility> // std::pair support #include <errno.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <hdfs.h> #include "../py3k_compat.h" #define MODE_READ "r" #define MODE_WRITE "w" #define MODE_APPEND "a" typedef struct { PyObject_HEAD char *host; int port; char *user; char *group; hdfsFS _fs; } FsInfo; PyObject* FsClass_new(PyTypeObject* type, PyObject *args, PyObject *kwds); void FsClass_dealloc(FsInfo* self); int FsClass_init(FsInfo* self, PyObject *args, PyObject *kwds); PyObject* FsClass_close(FsInfo* self); PyObject* FsClass_get_working_directory(FsInfo* self); PyObject* FsClass_get_default_block_size(FsInfo* self); PyObject* FsClass_get_path_info(FsInfo* self, PyObject *args, PyObject *kwds); PyObject* FsClass_get_hosts(FsInfo* self, PyObject *args, PyObject *kwds); PyObject* FsClass_get_used(FsInfo* self); PyObject* FsClass_get_capacity(FsInfo* self); PyObject* FsClass_set_replication(FsInfo* self, PyObject *args, PyObject *kwds); PyObject* FsClass_set_working_directory(FsInfo* self, PyObject *args, PyObject *kwds); PyObject* FsClass_open_file(FsInfo* self, PyObject *args, PyObject *kwds); PyObject* FsClass_copy(FsInfo* self, PyObject *args, PyObject *kwds); PyObject* FsClass_exists(FsInfo* self, PyObject *args, PyObject *kwds); PyObject* FsClass_list_directory(FsInfo *self, PyObject *args, PyObject *kwds); PyObject* FsClass_create_directory(FsInfo* self, PyObject *args, PyObject *kwds); PyObject* FsClass_rename(FsInfo* self, PyObject *args, PyObject *kwds); PyObject* FsClass_move(FsInfo* self, PyObject *args, PyObject *kwds); PyObject* FsClass_delete(FsInfo* self, PyObject *args, PyObject *kwds); PyObject* FsClass_chmod(FsInfo* self, PyObject *args, PyObject *kwds); PyObject* FsClass_chown(FsInfo* self, PyObject *args, PyObject *kwds); PyObject* FsClass_utime(FsInfo* self, PyObject *args, PyObject *kwds); #endif ================================================ FILE: src/native_core_hdfs/hdfs_module.cc ================================================ /* BEGIN_COPYRIGHT * * Copyright 2009-2026 CRS4. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy * of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations * under the License.
* * END_COPYRIGHT */ #include <Python.h> #if PY_MAJOR_VERSION >= 3 #define IS_PY3K 1 #endif #include "hdfs_fs.h" #include "hdfs_file.h" #include <structmember.h> static char* module__name__ = "native_core_hdfs"; static char* module__doc__ = "native_hdfs_core implementation"; /* FsType */ static PyMemberDef FsClass_members[] = { {NULL} /* Sentinel */ }; static PyMethodDef FsClass_methods[] = { {"get_working_directory", (PyCFunction) FsClass_get_working_directory, METH_NOARGS, "Get the current working directory"}, {"get_path_info", (PyCFunction) FsClass_get_path_info, METH_VARARGS, "Get information on a file or directory"}, {"get_default_block_size", (PyCFunction) FsClass_get_default_block_size, METH_NOARGS, "Get the default block size"}, {"get_hosts", (PyCFunction) FsClass_get_hosts, METH_VARARGS, "Get the names of the hosts where a file is stored"}, {"get_capacity", (PyCFunction) FsClass_get_capacity, METH_VARARGS, "Get the raw capacity of the filesystem"}, {"get_used", (PyCFunction) FsClass_get_used, METH_NOARGS, "Get the total raw size of all files in the filesystem."}, {"set_replication", (PyCFunction) FsClass_set_replication, METH_VARARGS, "Set the replication factor for a file"}, {"set_working_directory", (PyCFunction) FsClass_set_working_directory, METH_VARARGS, "Set the current working directory"}, {"open_file", (PyCFunction) FsClass_open_file, METH_VARARGS, "Open a file"}, {"close", (PyCFunction) FsClass_close, METH_NOARGS, "Close the HDFS connection"}, {"copy", (PyCFunction) FsClass_copy, METH_VARARGS, "Copy the given file"}, {"create_directory", (PyCFunction) FsClass_create_directory, METH_VARARGS, "Create a directory with the given name"}, {"list_directory", (PyCFunction) FsClass_list_directory, METH_VARARGS, "Get the contents of a directory"}, {"move", (PyCFunction) FsClass_move, METH_VARARGS, "Move the given file"}, {"rename", (PyCFunction) FsClass_rename, METH_VARARGS, "Rename the given file"}, {"delete", (PyCFunction) FsClass_delete, METH_VARARGS, "Delete the given file or directory"}, {"exists", (PyCFunction) FsClass_exists, METH_VARARGS, "Check if the given path exists on the filesystem"}, {"chmod", (PyCFunction) FsClass_chmod, METH_VARARGS, "Change file mode"}, {"chown", (PyCFunction) FsClass_chown, METH_VARARGS, "Change file owner and group"}, {"utime", (PyCFunction) FsClass_utime, METH_VARARGS, "Change file last access and modification time"}, {NULL} /* Sentinel */ }; static PyTypeObject FsType = { PyVarObject_HEAD_INIT(NULL, 0) "native_core_hdfs.CoreHdfsFs", /* tp_name */ sizeof(FsInfo), /* tp_basicsize */ 0, /* tp_itemsize */ (destructor) FsClass_dealloc, /* tp_dealloc */ 0, /* tp_print */ 0, /* tp_getattr */ 0, /* tp_setattr */ 0, /* tp_compare */ 0, /* tp_repr */ 0, /* tp_as_number */ 0, /* tp_as_sequence */ 0, /* tp_as_mapping */ 0, /* tp_hash */ 0, /* tp_call */ 0, /* tp_str */ 0, /* tp_getattro */ 0, /* tp_setattro */ 0, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ "Hdfs FS objects", /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ 0, /* tp_weaklistoffset */ 0, /* tp_iter */ 0, /* tp_iternext */ FsClass_methods, /* tp_methods */ FsClass_members, /* tp_members */ 0, /* tp_getset */ 0, /* tp_base */ 0, /* tp_dict */ 0, /* tp_descr_get */ 0, /* tp_descr_set */ 0, /* tp_dictoffset */ (initproc) FsClass_init, /* tp_init */ 0, /* tp_alloc */ FsClass_new, /* tp_new */ }; /* FileType */ static PyMemberDef FileClass_members[] = { {NULL} /* Sentinel */ }; static PyGetSetDef FileClass_getseters[] = { {"closed", (getter)FileClass_getclosed,
NULL, NULL}, {"buff_size", (getter)FileClass_getbuff_size, NULL, NULL}, {"name", (getter)FileClass_getname, NULL, NULL}, {"mode", (getter)FileClass_getmode, NULL, NULL}, {NULL} /* Sentinel */ }; static PyMethodDef FileClass_methods[] = { {"close", (PyCFunction)FileClass_close, METH_NOARGS, "Close the file"}, {"readable", (PyCFunction)FileClass_readable, METH_NOARGS, "True if the file can be read from"}, {"writable", (PyCFunction)FileClass_writable, METH_NOARGS, "True if the file can be written to"}, {"seekable", (PyCFunction)FileClass_seekable, METH_NOARGS, "True if the file support random access (it does if it's readable)"}, {"available", (PyCFunction) FileClass_available, METH_NOARGS, "Number of bytes that can be read without blocking"}, {"write", (PyCFunction)FileClass_write, METH_VARARGS, "Write to the file"}, {"flush", (PyCFunction) FileClass_flush, METH_NOARGS, "Force any buffered output to be written"}, {"read", (PyCFunction) FileClass_read, METH_VARARGS, "Read from the file"}, {"read_chunk", (PyCFunction) FileClass_read_chunk, METH_VARARGS, "Like read, but store data to the given buffer"}, /* Also export read_chunk as readinto for compatibility with Python io */ {"readinto", (PyCFunction) FileClass_read_chunk, METH_VARARGS, "Like read, but store data to the given buffer"}, {"pread", (PyCFunction) FileClass_pread, METH_VARARGS, "Read starting from the given position"}, {"pread_chunk", (PyCFunction) FileClass_pread_chunk, METH_VARARGS, "Like pread, but store data to the given buffer"}, {"seek", (PyCFunction) FileClass_seek, METH_VARARGS, "Seek to the given position"}, {"tell", (PyCFunction) FileClass_tell, METH_NOARGS, "Get the current position"}, {NULL} /* Sentinel */ }; static PyTypeObject FileType = { PyVarObject_HEAD_INIT(NULL, 0) "native_core_hdfs.CoreHdfsFile", /* tp_name */ sizeof(FileInfo), /* tp_basicsize */ 0, /* tp_itemsize */ (destructor)FileClass_dealloc, /* tp_dealloc */ 0, /* tp_print */ 0, /* tp_getattr */ 0, /* tp_setattr */ 0, /* tp_compare */ 0, /* tp_repr */ 0, /* tp_as_number */ 0, /* tp_as_sequence */ 0, /* tp_as_mapping */ 0, /* tp_hash */ 0, /* tp_call */ 0, /* tp_str */ 0, /* tp_getattro */ 0, /* tp_setattro */ 0, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ "Hdfs File objects", /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ 0, /* tp_weaklistoffset */ 0, /* tp_iter */ 0, /* tp_iternext */ FileClass_methods, /* tp_methods */ FileClass_members, /* tp_members */ FileClass_getseters, /* tp_getset */ 0, /* tp_base */ 0, /* tp_dict */ 0, /* tp_descr_get */ 0, /* tp_descr_set */ 0, /* tp_dictoffset */ (initproc)FileClass_init, /* tp_init */ 0, /* tp_alloc */ FileClass_new, /* tp_new */ }; static PyMethodDef module_methods[] = { {NULL} /* Sentinel */ }; #ifndef PyMODINIT_FUNC /* declarations for DLL import/export */ #define PyMODINIT_FUNC void #endif #if IS_PY3K static struct PyModuleDef module_def = { PyModuleDef_HEAD_INIT, module__name__, /* m_name */ module__doc__, /* m_doc */ -1, /* m_size */ module_methods, /* m_methods */ NULL, /* m_reload */ NULL, /* m_traverse */ NULL, /* m_clear */ NULL, /* m_free */ }; #endif #if IS_PY3K PyMODINIT_FUNC PyInit_native_core_hdfs(void) { PyObject* m; if (PyType_Ready(&FsType) < 0) return NULL; if (PyType_Ready(&FileType) < 0) return NULL; m = PyModule_Create(&module_def); if (m == NULL) return NULL; Py_INCREF(&FsType); Py_INCREF(&FileType); PyModule_AddObject(m, "CoreHdfsFs", (PyObject *)&FsType); PyModule_AddObject(m, "CoreHdfsFile", (PyObject *)&FileType); return m; 
} #else PyMODINIT_FUNC initnative_core_hdfs(void) { PyObject* m; if (PyType_Ready(&FsType) < 0) return; if (PyType_Ready(&FileType) < 0) return; m = Py_InitModule3(module__name__, module_methods, module__doc__); if (m == NULL) return; Py_INCREF(&FsType); Py_INCREF(&FileType); PyModule_AddObject(m, "CoreHdfsFs", (PyObject *)&FsType); PyModule_AddObject(m, "CoreHdfsFile", (PyObject *)&FileType); PyModule_AddStringConstant(m, "MODE_READ", MODE_READ); PyModule_AddStringConstant(m, "MODE_WRITE", MODE_WRITE); PyModule_AddStringConstant(m, "MODE_APPEND", MODE_APPEND); } #endif ================================================ FILE: src/py3k_compat.h ================================================ /* BEGIN_COPYRIGHT * * Copyright 2009-2026 CRS4. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy * of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations * under the License. * * END_COPYRIGHT */ #ifndef PY3K_COMPAT_H #define PY3K_COMPAT_H #if PY_MAJOR_VERSION >= 3 #define IS_PY3K 1 #endif #include "buf_macros.h" #include "Py_macros.h" #endif ================================================ FILE: src/sercore/HadoopUtils/SerialUtils.cc ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #include "SerialUtils.hh" #include #include #include #include #include using std::string; namespace HadoopUtils { Error::Error(const std::string& msg): error(msg) { } Error::Error(const std::string& msg, const std::string& file, int line, const std::string& function) { error = msg + " at " + file + ":" + std::to_string(line) + " in " + function; } const std::string& Error::getMessage() const { return error; } FileInStream::FileInStream() { mFile = NULL; isOwned = false; } bool FileInStream::open(const std::string& name) { mFile = fopen(name.c_str(), "rb"); isOwned = true; return (mFile != NULL); } bool FileInStream::open(FILE* file) { mFile = file; isOwned = false; return (mFile != NULL); } void FileInStream::read(void *buf, size_t len) { size_t result = fread(buf, len, 1, mFile); if (result == 0) { if (feof(mFile)) { HADOOP_ASSERT(false, "end of file"); } else { HADOOP_ASSERT(false, string("read error on file: ") + strerror(errno)); } } } bool FileInStream::skip(size_t nbytes) { return (0==fseek(mFile, nbytes, SEEK_CUR)); } bool FileInStream::close() { int ret = 0; if (mFile != NULL && isOwned) { ret = fclose(mFile); } mFile = NULL; return (ret==0); } FileInStream::~FileInStream() { if (mFile != NULL) { close(); } } FileOutStream::FileOutStream() { mFile = NULL; isOwned = false; } bool FileOutStream::open(const std::string& name, bool overwrite) { if (!overwrite) { mFile = fopen(name.c_str(), "rb"); if (mFile != NULL) { fclose(mFile); return false; } } mFile = fopen(name.c_str(), "wb"); isOwned = true; return (mFile != NULL); } bool FileOutStream::open(FILE* file) { mFile = file; isOwned = false; return (mFile != NULL); } void FileOutStream::write(const void* buf, size_t len) { size_t result = fwrite(buf, len, 1, mFile); HADOOP_ASSERT(result == 1, string("write error to file: ") + strerror(errno)); } bool FileOutStream::advance(size_t nbytes) { return (0==fseek(mFile, nbytes, SEEK_CUR)); } bool FileOutStream::close() { int ret = 0; if (mFile != NULL && isOwned) { ret = fclose(mFile); } mFile = NULL; return (ret == 0); } void FileOutStream::flush() { fflush(mFile); } FileOutStream::~FileOutStream() { if (mFile != NULL) { close(); } } StringInStream::StringInStream(const std::string& str): buffer(str) { itr = buffer.begin(); } void StringInStream::read(void *buf, size_t buflen) { size_t bytes = 0; char* output = (char*) buf; std::string::const_iterator end = buffer.end(); while (bytes < buflen) { output[bytes++] = *itr; ++itr; if (itr == end) { break; } } HADOOP_ASSERT(bytes == buflen, "unexpected end of string reached"); } void serializeInt(int32_t t, OutStream& stream) { serializeLong(t, stream); } void serializeLong(int64_t t, OutStream& stream) { if (t >= -112 && t <= 127) { int8_t b = t; stream.write(&b, 1); return; } int8_t len = -112; if (t < 0) { t ^= -1ll; // reset the sign bit len = -120; } uint64_t tmp = t; while (tmp != 0) { tmp = tmp >> 8; len--; } stream.write(&len, 1); len = (len < -120) ? 
-(len + 120) : -(len + 112); for (uint32_t idx = len; idx != 0; idx--) { uint32_t shiftbits = (idx - 1) * 8; uint64_t mask = 0xFFll << shiftbits; uint8_t b = (t & mask) >> shiftbits; stream.write(&b, 1); } } int32_t deserializeInt(InStream& stream) { return deserializeLong(stream); } int64_t deserializeLong(InStream& stream) { int8_t b; stream.read(&b, 1); if (b >= -112) { return b; } bool negative; int len; if (b < -120) { negative = true; len = -120 - b; } else { negative = false; len = -112 - b; } uint8_t barr[len]; stream.read(barr, len); int64_t t = 0; for (int idx = 0; idx < len; idx++) { t = t << 8; t |= (barr[idx] & 0xFF); } if (negative) { t ^= -1ll; } return t; } void serializeFloat(float t, OutStream& stream) { char buf[sizeof(float)]; XDR xdrs; xdrmem_create(&xdrs, buf, sizeof(float), XDR_ENCODE); xdr_float(&xdrs, &t); stream.write(buf, sizeof(float)); } float deserializeFloat(InStream& stream) { float f; deserializeFloat(f, stream); return f; } void deserializeFloat(float& t, InStream& stream) { char buf[sizeof(float)]; stream.read(buf, sizeof(float)); XDR xdrs; xdrmem_create(&xdrs, buf, sizeof(float), XDR_DECODE); xdr_float(&xdrs, &t); } void serializeString(const std::string& t, OutStream& stream) { serializeInt(t.length(), stream); if (t.length() > 0) { stream.write(t.data(), t.length()); } } void deserializeString(std::string& t, InStream& stream) { int32_t len = deserializeInt(stream); if (len > 0) { // resize the string to the right length t.resize(len); // read into the string in 64k chunks const int bufSize = 65536; int offset = 0; char buf[bufSize]; while (len > 0) { int chunkLength = len > bufSize ? bufSize : len; stream.read(buf, chunkLength); t.replace(offset, chunkLength, buf, chunkLength); offset += chunkLength; len -= chunkLength; } } else { t.clear(); } } } ================================================ FILE: src/sercore/HadoopUtils/SerialUtils.hh ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef HADOOP_SERIAL_UTILS_HH #define HADOOP_SERIAL_UTILS_HH #include <stdio.h> #include <stdint.h> #include <string> namespace HadoopUtils { /** * A simple exception class that records a message for the user. */ class Error { private: std::string error; public: /** * Create an error object with the given message. */ Error(const std::string& msg); /** * Construct an error object with the given message that was created on * the given file, line, and function. */ Error(const std::string& msg, const std::string& file, int line, const std::string& function); /** * Get the error message. */ const std::string& getMessage() const; }; /** * Check to make sure that the condition is true, and throw an exception * if it is not. The exception will contain the message and a description * of the source location.
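* For example, HADOOP_ASSERT(len >= 0, "negative length"); throws
* HadoopUtils::Error carrying a message of the form
* "negative length at SerialUtils.cc:42 in deserializeString"
* (file, line and function filled in from the call site; the values shown
* here are illustrative).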
*/ #define HADOOP_ASSERT(CONDITION, MESSAGE) \ { \ if (!(CONDITION)) { \ throw HadoopUtils::Error((MESSAGE), __FILE__, __LINE__, \ __func__); \ } \ } /** * An interface for an input stream. */ class InStream { public: /** * Reads len bytes from the stream into the buffer. * @param buf the buffer to read into * @param len the number of bytes to read * @throws Error if there are problems reading */ virtual void read(void *buf, size_t len) = 0; virtual ~InStream() {} }; /** * An interface for an output stream. */ class OutStream { public: /** * Write the given buffer to the stream. * @param buf the data to write * @param len the number of bytes to write * @throws Error if there are problems writing */ virtual void write(const void *buf, size_t len) = 0; /** * Flush the data to the underlying store. */ virtual void flush() = 0; virtual ~OutStream() {} }; /** * A class to read a file as a stream. */ class FileInStream : public InStream { public: FileInStream(); bool open(const std::string& name); bool open(FILE* file); void read(void *buf, size_t buflen); bool skip(size_t nbytes); bool close(); virtual ~FileInStream(); private: /** * The file to read from. */ FILE *mFile; /** * Is this class responsible for closing the FILE*? */ bool isOwned; }; /** * A class to write a stream to a file. */ class FileOutStream: public OutStream { public: /** * Create a stream that isn't bound to anything. */ FileOutStream(); /** * Create the given file, potentially overwriting an existing file. */ bool open(const std::string& name, bool overwrite); bool open(FILE* file); void write(const void* buf, size_t len); bool advance(size_t nbytes); void flush(); bool close(); virtual ~FileOutStream(); private: FILE *mFile; bool isOwned; }; /** * A stream that reads from a string. */ class StringInStream: public InStream { public: StringInStream(const std::string& str); virtual void read(void *buf, size_t buflen); private: const std::string& buffer; std::string::const_iterator itr; }; void serializeInt(int32_t t, OutStream& stream); int32_t deserializeInt(InStream& stream); void serializeLong(int64_t t, OutStream& stream); int64_t deserializeLong(InStream& stream); void serializeFloat(float t, OutStream& stream); void deserializeFloat(float& t, InStream& stream); float deserializeFloat(InStream& stream); void serializeString(const std::string& t, OutStream& stream); void deserializeString(std::string& t, InStream& stream); } #endif ================================================ FILE: src/sercore/hu_extras.cpp ================================================ // BEGIN_COPYRIGHT // // Copyright 2009-2026 CRS4. // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy // of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the // License for the specific language governing permissions and limitations // under the License.
// // END_COPYRIGHT #include "HadoopUtils/SerialUtils.hh" #define INT64_SIZE sizeof(int64_t) int64_t deserializeLongWritable(HadoopUtils::InStream& stream) { int64_t rval = 0; unsigned char bytes[INT64_SIZE]; stream.read(bytes, INT64_SIZE); for (std::size_t i = 0; i < INT64_SIZE; ++i) { rval = (rval << 8) | bytes[i]; /* big-endian: shift in one byte (8 bits) per iteration */ } return rval; } ================================================ FILE: src/sercore/hu_extras.h ================================================ // BEGIN_COPYRIGHT // // Copyright 2009-2026 CRS4. // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy // of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the // License for the specific language governing permissions and limitations // under the License. // // END_COPYRIGHT #pragma once #include "HadoopUtils/SerialUtils.hh" /** * Read a hadoop.io.LongWritable (java.io.DataInput.readLong). */ int64_t deserializeLongWritable(HadoopUtils::InStream& stream); ================================================ FILE: src/sercore/sercore.cpp ================================================ // BEGIN_COPYRIGHT // // Copyright 2009-2026 CRS4. // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy // of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the // License for the specific language governing permissions and limitations // under the License.
// // END_COPYRIGHT #include <Python.h> #include "hu_extras.h" #include "streams.h" const char* m_name = "sercore"; const char* m_doc = "core serialization utils"; #if PY_MAJOR_VERSION >= 3 #define PY3 #define INIT_RETURN(V) return V; #else #define INIT_RETURN(V) return; #endif // Deserializes a hadoop.(mapred|mapreduce.lib.input).FileSplit static PyObject * deserializeFileSplit(PyObject *self, PyObject *args) { PyObject *data, *rval; Py_buffer buffer = {NULL, NULL}; PyThreadState *state; if (!PyArg_ParseTuple(args, "O", &data)) { return NULL; } if (PyObject_GetBuffer(data, &buffer, PyBUF_SIMPLE) < 0) { PyErr_SetString(PyExc_TypeError, "data not accessible as a buffer"); return NULL; } // deserialize fields std::string s((const char*)buffer.buf, buffer.len); HadoopUtils::StringInStream stream(s); std::string fname; int64_t offset, length; state = PyEval_SaveThread(); try { HadoopUtils::deserializeString(fname, stream); offset = deserializeLongWritable(stream); length = deserializeLongWritable(stream); } catch (HadoopUtils::Error e) { PyEval_RestoreThread(state); PyBuffer_Release(&buffer); PyErr_SetString(PyExc_IOError, e.getMessage().c_str()); return NULL; } PyEval_RestoreThread(state); PyBuffer_Release(&buffer); // build output tuple PyObject *_fname, *_offset, *_length; if (!(_fname = PyUnicode_FromStringAndSize(fname.c_str(), fname.size()))) { return NULL; } if (!(_offset = Py_BuildValue("L", offset))) { return NULL; } if (!(_length = Py_BuildValue("L", length))) { return NULL; } if (!(rval = PyTuple_New(3))) { return NULL; } PyTuple_SET_ITEM(rval, 0, _fname); PyTuple_SET_ITEM(rval, 1, _offset); PyTuple_SET_ITEM(rval, 2, _length); return rval; } static PyMethodDef SercoreMethods[] = { {"deserialize_file_split", deserializeFileSplit, METH_VARARGS, "deserialize_file_split(data): deserialize a Hadoop FileSplit"}, {NULL} }; #ifdef PY3 static struct PyModuleDef module_def = { PyModuleDef_HEAD_INIT, m_name, m_doc, 0, SercoreMethods, NULL, NULL, NULL, NULL }; #endif PyMODINIT_FUNC #ifdef PY3 PyInit_sercore(void) { #else initsercore(void) { #endif PyObject *m; FileInStreamType.tp_new = PyType_GenericNew; if (PyType_Ready(&FileInStreamType) < 0) { INIT_RETURN(NULL); } FileOutStreamType.tp_new = PyType_GenericNew; if (PyType_Ready(&FileOutStreamType) < 0) { INIT_RETURN(NULL); } #ifdef PY3 m = PyModule_Create(&module_def); #else m = Py_InitModule3(m_name, SercoreMethods, m_doc); #endif if (!m) { INIT_RETURN(NULL); } Py_INCREF(&FileInStreamType); PyModule_AddObject(m, "FileInStream", (PyObject *)&FileInStreamType); Py_INCREF(&FileOutStreamType); PyModule_AddObject(m, "FileOutStream", (PyObject *)&FileOutStreamType); INIT_RETURN(m); } ================================================ FILE: src/sercore/streams.cpp ================================================ // BEGIN_COPYRIGHT // // Copyright 2009-2026 CRS4. // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy // of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the // License for the specific language governing permissions and limitations // under the License. // // END_COPYRIGHT // WARNING: types defined here are **NOT** designed for inheritance.
For // instance, FileInStream_readTuple calls other FileInStream_read* methods // directly at the C++ level. Since they are not part of the public API --- // exactly one input and one output stream are used in the pipes protocol, and // that's it --- we can make the code simpler and more efficient. #define PY_SSIZE_T_CLEAN // must be defined before including Python.h #include <Python.h> #include <stdio.h> #include <string.h> #include <unistd.h> #include <memory> #include <string> #include "hu_extras.h" #include "streams.h" #define OUTPUT 50 #define PARTITIONED_OUTPUT 51 // This can only be used in functions that return a PyObject* # define _ASSERT_STREAM_OPEN { \ if (self->closed) { \ PyErr_SetString(PyExc_ValueError, "I/O operation on closed stream"); \ return NULL; \ } \ } // PyFile_AsFile is only available in Python 2, for "old style" file objects // This should work on anything associated to a file descriptor FILE * _PyFile_AsFile(PyObject *f, const char* mode) { int fd, newfd; FILE *fp; PyThreadState *state; if ((fd = PyObject_AsFileDescriptor(f)) == -1) { return NULL; } state = PyEval_SaveThread(); if ((newfd = dup(fd)) == -1) { goto error; } if (!(fp = fdopen(newfd, mode))) { goto error; } PyEval_RestoreThread(state); return fp; error: PyEval_RestoreThread(state); PyErr_SetFromErrno(PyExc_IOError); return NULL; } static int FileInStream_init(FileInStreamObj *self, PyObject *args, PyObject *kwds) { const char *filename; PyThreadState *state; self->stream = std::make_shared<HadoopUtils::FileInStream>(); if (PyArg_ParseTuple(args, "es", "utf-8", &filename)) { state = PyEval_SaveThread(); if (!self->stream->open(std::string(filename))) { PyEval_RestoreThread(state); PyErr_SetFromErrno(PyExc_IOError); PyMem_Free((void*)filename); return -1; } PyEval_RestoreThread(state); PyMem_Free((void*)filename); } else { PyErr_Clear(); PyObject *inarg; if (!PyArg_ParseTuple(args, "O", &inarg)) { return -1; } if (!(self->fp = _PyFile_AsFile(inarg, "rb"))) { return -1; } self->stream->open(self->fp); // this variant just stores a reference } self->closed = false; return 0; } static PyObject * FileInStream_close(FileInStreamObj *self) { PyThreadState *state; if (self->closed) { Py_RETURN_NONE; } state = PyEval_SaveThread(); if (self->fp) { fclose(self->fp); } bool res = self->stream->close(); if (!res) { PyEval_RestoreThread(state); return PyErr_SetFromErrno(PyExc_IOError); } PyEval_RestoreThread(state); self->closed = true; Py_RETURN_NONE; } static PyObject * FileInStream_enter(FileInStreamObj *self) { _ASSERT_STREAM_OPEN; Py_INCREF(self); return (PyObject*)self; } static PyObject * FileInStream_exit(FileInStreamObj *self, PyObject *args) { return FileInStream_close(self); } static PyObject * FileInStream_read(FileInStreamObj *self, PyObject *args) { size_t len; PyObject *rval; PyThreadState *state; _ASSERT_STREAM_OPEN; if (!PyArg_ParseTuple(args, "n", &len)) { return NULL; } if (!(rval = PyBytes_FromStringAndSize(NULL, len))) { return NULL; } state = PyEval_SaveThread(); try { self->stream->read(PyBytes_AS_STRING(rval), len); } catch (HadoopUtils::Error e) { PyEval_RestoreThread(state); PyErr_SetString(PyExc_IOError, e.getMessage().c_str()); Py_DECREF(rval); return NULL; } PyEval_RestoreThread(state); return rval; } static PyObject * FileInStream_readVInt(FileInStreamObj *self) { int32_t rval; PyThreadState *state; _ASSERT_STREAM_OPEN; state = PyEval_SaveThread(); try { rval = HadoopUtils::deserializeInt(*self->stream); } catch (HadoopUtils::Error e) { PyEval_RestoreThread(state); PyErr_SetString(PyExc_IOError, e.getMessage().c_str()); return NULL; } PyEval_RestoreThread(state); return
Py_BuildValue("i", rval); } static PyObject * FileInStream_readVLong(FileInStreamObj *self) { int64_t rval; PyThreadState *state; _ASSERT_STREAM_OPEN; state = PyEval_SaveThread(); try { rval = HadoopUtils::deserializeLong(*self->stream); } catch (HadoopUtils::Error e) { PyEval_RestoreThread(state); PyErr_SetString(PyExc_IOError, e.getMessage().c_str()); return NULL; } PyEval_RestoreThread(state); return Py_BuildValue("L", rval); } static PyObject * FileInStream_readFloat(FileInStreamObj *self) { float rval; PyThreadState *state; _ASSERT_STREAM_OPEN; state = PyEval_SaveThread(); try { rval = HadoopUtils::deserializeFloat(*self->stream); } catch (HadoopUtils::Error e) { PyEval_RestoreThread(state); PyErr_SetString(PyExc_IOError, e.getMessage().c_str()); return NULL; } PyEval_RestoreThread(state); return PyFloat_FromDouble(rval); } std::string _FileInStream_read_cppstring(FileInStreamObj *self) { std::string rval; PyThreadState *state; state = PyEval_SaveThread(); try { HadoopUtils::deserializeString(rval, *self->stream); } catch (HadoopUtils::Error e) { PyEval_RestoreThread(state); PyErr_SetString(PyExc_IOError, e.getMessage().c_str()); throw; } PyEval_RestoreThread(state); return rval; } static PyObject * FileInStream_readString(FileInStreamObj *self) { _ASSERT_STREAM_OPEN; std::string s; try { s = _FileInStream_read_cppstring(self); } catch (HadoopUtils::Error e) { return NULL; } return PyUnicode_FromStringAndSize(s.c_str(), s.size()); } static PyObject * FileInStream_readBytes(FileInStreamObj *self) { _ASSERT_STREAM_OPEN; std::string s; try { s = _FileInStream_read_cppstring(self); } catch (HadoopUtils::Error e) { return NULL; } return PyBytes_FromStringAndSize(s.c_str(), s.size()); } static PyObject * FileInStream_readTuple(FileInStreamObj *self, PyObject *args) { char *fmt; PyObject *rval; _ASSERT_STREAM_OPEN; if (!PyArg_ParseTuple(args, "s", &fmt)) { return NULL; } std::size_t nitems = strlen(fmt); if (!(rval = PyTuple_New(nitems))) { return NULL; } PyObject *item; for (std::size_t i = 0; i < nitems; ++i) { switch(fmt[i]) { case 'i': if (!(item = FileInStream_readVInt(self))) goto error; break; case 'l': if (!(item = FileInStream_readVLong(self))) goto error; break; case 'f': if (!(item = FileInStream_readFloat(self))) goto error; break; case 's': if (!(item = FileInStream_readString(self))) goto error; break; case 'b': if (!(item = FileInStream_readBytes(self))) goto error; break; default: Py_DECREF(rval); return PyErr_Format(PyExc_ValueError, "Unknown format '%c'", fmt[i]); } PyTuple_SET_ITEM(rval, i, item); } return rval; error: Py_DECREF(rval); return NULL; } static PyObject * FileInStream_skip(FileInStreamObj *self, PyObject *args) { size_t len; PyThreadState *state; _ASSERT_STREAM_OPEN; if (!PyArg_ParseTuple(args, "n", &len)) { return NULL; } state = PyEval_SaveThread(); bool res = self->stream->skip(len); if (!res) { PyEval_RestoreThread(state); return PyErr_SetFromErrno(PyExc_IOError); } PyEval_RestoreThread(state); Py_RETURN_NONE; } // Extra types not used directly by the protocol, but that may appear as a // result of serializing objects such as keys, values and input splits. // **NOTE**: within the command stream, each serialized object starts with a // VInt that specifies its length. 
For instance, to read a LongWritable key: // assert stream.read_vint() == 8 // key = stream.read_long_writable() // Equivalent, but probably less efficient: // key_bytes = stream.read_bytes() // assert len(key_bytes) == 8 // key = struct.unpack(">q", key_bytes)[0] static PyObject * FileInStream_readLongWritable(FileInStreamObj *self) { int64_t rval = 0; PyThreadState *state; _ASSERT_STREAM_OPEN; state = PyEval_SaveThread(); try { rval = deserializeLongWritable(*self->stream); } catch (HadoopUtils::Error e) { PyEval_RestoreThread(state); PyErr_SetString(PyExc_IOError, e.getMessage().c_str()); return NULL; } PyEval_RestoreThread(state); return Py_BuildValue("L", rval); } static PyMethodDef FileInStream_methods[] = { {"close", (PyCFunction)FileInStream_close, METH_NOARGS, "close(): close the currently open file"}, {"read", (PyCFunction)FileInStream_read, METH_VARARGS, "read(len): read len bytes from the stream"}, {"read_vint", (PyCFunction)FileInStream_readVInt, METH_NOARGS, "read_vint(): read a variable length integer from the stream"}, {"read_vlong", (PyCFunction)FileInStream_readVLong, METH_NOARGS, "read_vlong(): read a variable length long integer from the stream"}, {"read_float", (PyCFunction)FileInStream_readFloat, METH_NOARGS, "read_float(): read a float from the stream"}, {"read_string", (PyCFunction)FileInStream_readString, METH_NOARGS, "read_string(): read a string from the stream"}, {"read_bytes", (PyCFunction)FileInStream_readBytes, METH_NOARGS, "read_bytes(): read a bytes object from the stream"}, {"read_tuple", (PyCFunction)FileInStream_readTuple, METH_VARARGS, "read_tuple(fmt): read len(fmt) values, where fmt specifies types"}, {"skip", (PyCFunction)FileInStream_skip, METH_VARARGS, "skip(len): skip len bytes"}, {"__enter__", (PyCFunction)FileInStream_enter, METH_NOARGS}, {"__exit__", (PyCFunction)FileInStream_exit, METH_VARARGS}, {"read_long_writable", (PyCFunction)FileInStream_readLongWritable, METH_NOARGS, "read_long_writable(): read a hadoop.io.LongWritable"}, {NULL} /* Sentinel */ }; PyTypeObject FileInStreamType = { PyVarObject_HEAD_INIT(NULL, 0) "sercore.FileInStream", /* tp_name */ sizeof(FileInStreamObj), /* tp_basicsize */ 0, /* tp_itemsize */ 0, /* tp_dealloc */ 0, /* tp_print */ 0, /* tp_getattr */ 0, /* tp_setattr */ 0, /* tp_compare */ 0, /* tp_repr */ 0, /* tp_as_number */ 0, /* tp_as_sequence */ 0, /* tp_as_mapping */ 0, /* tp_hash */ 0, /* tp_call */ 0, /* tp_str */ 0, /* tp_getattro */ 0, /* tp_setattro */ 0, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ "A class to read a file as a stream", /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ 0, /* tp_weaklistoffset */ 0, /* tp_iter */ 0, /* tp_iternext */ FileInStream_methods, /* tp_methods */ 0, /* tp_members */ 0, /* tp_getset */ 0, /* tp_base */ 0, /* tp_dict */ 0, /* tp_descr_get */ 0, /* tp_descr_set */ 0, /* tp_dictoffset */ (initproc)FileInStream_init, /* tp_init */ 0, /* tp_alloc */ 0, /* tp_new */ }; static int FileOutStream_init(FileOutStreamObj *self, PyObject *args, PyObject *kwds) { const char *filename; PyThreadState *state; self->stream = std::make_shared<HadoopUtils::FileOutStream>(); if (PyArg_ParseTuple(args, "es", "utf-8", &filename)) { state = PyEval_SaveThread(); if (!self->stream->open(std::string(filename), true)) { PyEval_RestoreThread(state); PyErr_SetFromErrno(PyExc_IOError); PyMem_Free((void*)filename); return -1; } PyEval_RestoreThread(state); PyMem_Free((void*)filename); } else { PyErr_Clear(); PyObject *inarg; if (!PyArg_ParseTuple(args, "O", &inarg)) {
return -1; } if (!(self->fp = _PyFile_AsFile(inarg, "wb"))) { return -1; } self->stream->open(self->fp); // this variant just stores a reference } self->closed = false; return 0; } static PyObject * FileOutStream_close(FileOutStreamObj *self) { PyThreadState *state; if (self->closed) { Py_RETURN_NONE; } state = PyEval_SaveThread(); if (self->fp) { fclose(self->fp); } bool res = self->stream->close(); if (!res) { PyEval_RestoreThread(state); return PyErr_SetFromErrno(PyExc_IOError); } PyEval_RestoreThread(state); self->closed = true; Py_RETURN_NONE; } static PyObject * FileOutStream_enter(FileOutStreamObj *self) { _ASSERT_STREAM_OPEN; Py_INCREF(self); return (PyObject*)self; } static PyObject * FileOutStream_exit(FileOutStreamObj *self, PyObject *args) { return FileOutStream_close(self); } static PyObject * FileOutStream_write(FileOutStreamObj *self, PyObject *args) { PyObject* data = NULL; Py_buffer buffer = {NULL, NULL}; PyThreadState *state; _ASSERT_STREAM_OPEN; if (!PyArg_ParseTuple(args, "O", &data)) { return NULL; } if (PyObject_GetBuffer(data, &buffer, PyBUF_SIMPLE) < 0) { PyErr_SetString(PyExc_TypeError, "data not accessible as a buffer"); return NULL; } state = PyEval_SaveThread(); try { self->stream->write(buffer.buf, buffer.len); } catch (HadoopUtils::Error e) { PyEval_RestoreThread(state); PyBuffer_Release(&buffer); /* release the buffer view on every exit path */ PyErr_SetString(PyExc_IOError, e.getMessage().c_str()); return NULL; } PyEval_RestoreThread(state); PyBuffer_Release(&buffer); Py_RETURN_NONE; } static PyObject * FileOutStream_writeVInt(FileOutStreamObj *self, PyObject *args) { int val = 0; PyThreadState *state; _ASSERT_STREAM_OPEN; if (!PyArg_ParseTuple(args, "i", &val)) { return NULL; } state = PyEval_SaveThread(); try { HadoopUtils::serializeInt(val, *self->stream); } catch (HadoopUtils::Error e) { PyEval_RestoreThread(state); PyErr_SetString(PyExc_IOError, e.getMessage().c_str()); return NULL; } PyEval_RestoreThread(state); Py_RETURN_NONE; } static PyObject * FileOutStream_writeVLong(FileOutStreamObj *self, PyObject *args) { long long val; PyThreadState *state; _ASSERT_STREAM_OPEN; if (!PyArg_ParseTuple(args, "L", &val)) { return NULL; } state = PyEval_SaveThread(); try { HadoopUtils::serializeLong(val, *self->stream); } catch (HadoopUtils::Error e) { PyEval_RestoreThread(state); PyErr_SetString(PyExc_IOError, e.getMessage().c_str()); return NULL; } PyEval_RestoreThread(state); Py_RETURN_NONE; } static PyObject * FileOutStream_writeFloat(FileOutStreamObj *self, PyObject *args) { float val; PyThreadState *state; _ASSERT_STREAM_OPEN; if (!PyArg_ParseTuple(args, "f", &val)) { return NULL; } state = PyEval_SaveThread(); try { HadoopUtils::serializeFloat(val, *self->stream); } catch (HadoopUtils::Error e) { PyEval_RestoreThread(state); PyErr_SetString(PyExc_IOError, e.getMessage().c_str()); return NULL; } PyEval_RestoreThread(state); Py_RETURN_NONE; } static PyObject* _FileOutStream_write_cppstring(FileOutStreamObj *self, std::string s) { PyThreadState *state = PyEval_SaveThread(); try { HadoopUtils::serializeString(s, *self->stream); } catch (HadoopUtils::Error e) { PyEval_RestoreThread(state); PyErr_SetString(PyExc_IOError, e.getMessage().c_str()); return NULL; } PyEval_RestoreThread(state); Py_RETURN_NONE; } static PyObject * FileOutStream_writeString(FileOutStreamObj *self, PyObject *args) { _ASSERT_STREAM_OPEN; #if PY_MAJOR_VERSION < 3 // default encoding is ASCII, so "s#" would not work here PyObject *pystring, *pybytes; if (!PyArg_ParseTuple(args, "O", &pystring)) { return NULL; } std::string s; if (PyBytes_Check(pystring)) { s =
std::string(PyBytes_AS_STRING(pystring), PyBytes_GET_SIZE(pystring)); } else { if (!(pybytes = PyUnicode_AsUTF8String(pystring))) { return NULL; } s = std::string(PyBytes_AS_STRING(pybytes), PyBytes_GET_SIZE(pybytes)); Py_DECREF(pybytes); } return _FileOutStream_write_cppstring(self, s); #else const char* buf; Py_ssize_t len; if (!PyArg_ParseTuple(args, "s#", &buf, &len)) { return NULL; } return _FileOutStream_write_cppstring(self, std::string(buf, len)); #endif } static PyObject * FileOutStream_writeBytes(FileOutStreamObj *self, PyObject *args) { _ASSERT_STREAM_OPEN; #if PY_MAJOR_VERSION < 3 // "y#" not available PyObject *pyval, *rval; Py_buffer buffer = {NULL, NULL}; if (!PyArg_ParseTuple(args, "O", &pyval)) { return NULL; } if (PyObject_GetBuffer(pyval, &buffer, PyBUF_SIMPLE) < 0) { return NULL; } std::string s((const char*)buffer.buf, buffer.len); rval = _FileOutStream_write_cppstring(self, s); PyBuffer_Release(&buffer); return rval; #else const char* buf; Py_ssize_t len; if (!PyArg_ParseTuple(args, "y#", &buf, &len)) { return NULL; } return _FileOutStream_write_cppstring(self, std::string(buf, len)); #endif } static PyObject * FileOutStream_writeTuple(FileOutStreamObj *self, PyObject *args) { PyObject *inarg, *iterator, *item, *argt, *res; char *fmt; _ASSERT_STREAM_OPEN; if (!PyArg_ParseTuple(args, "sO", &fmt, &inarg)) { return NULL; } if (!(iterator = PyObject_GetIter(inarg))) { return NULL; } for (std::size_t i = 0; i < strlen(fmt); ++i) { if (!(item = PyIter_Next(iterator))) { if (!PyErr_Occurred()) { PyErr_SetString(PyExc_ValueError, "not enough items"); } Py_DECREF(iterator); return NULL; } /* pack the item once and drop the temporary argument tuple afterwards, so it is not leaked at each iteration */ if (!(argt = PyTuple_Pack(1, item))) goto error; switch(fmt[i]) { case 'i': res = FileOutStream_writeVInt(self, argt); break; case 'l': res = FileOutStream_writeVLong(self, argt); break; case 'f': res = FileOutStream_writeFloat(self, argt); break; case 's': res = FileOutStream_writeString(self, argt); break; case 'b': res = FileOutStream_writeBytes(self, argt); break; default: PyErr_Format(PyExc_ValueError, "Unknown format '%c'", fmt[i]); res = NULL; } Py_DECREF(argt); if (!res) goto error; Py_DECREF(res); Py_DECREF(item); } Py_DECREF(iterator); Py_RETURN_NONE; error: Py_DECREF(item); Py_DECREF(iterator); return NULL; } // Same as write_tuple("ibb", (OUTPUT, k, v)) or, when part is specified, // write_tuple("iibb", (PARTITIONED_OUTPUT, part, k, v)), but more efficient. // Optimizing other commands in this way is probably worthless.
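// Hedged usage sketch from the Python side (the file name and byte values // are illustrative, not part of the module); each call below emits the same // bytes as the write_tuple equivalents described above: // from sercore import FileOutStream // out = FileOutStream("cmd_stream.bin") // out.write_output(b"key", b"value") # OUTPUT, key, value // out.write_output(b"key", b"value", 3) # PARTITIONED_OUTPUT, 3, key, value // out.flush()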
static PyObject * FileOutStream_writeOutput(FileOutStreamObj *self, PyObject *args) { int part = -1; PyThreadState *state; _ASSERT_STREAM_OPEN; #if PY_MAJOR_VERSION < 3 // "y#" not available PyObject *pykey, *pyval; Py_buffer kbuf = {NULL, NULL}; Py_buffer vbuf = {NULL, NULL}; if (!PyArg_ParseTuple(args, "OO|i", &pykey, &pyval, &part)) { return NULL; } if (PyObject_GetBuffer(pykey, &kbuf, PyBUF_SIMPLE) < 0) { return NULL; } if (PyObject_GetBuffer(pyval, &vbuf, PyBUF_SIMPLE) < 0) { PyBuffer_Release(&kbuf); return NULL; } std::string ks((const char*)kbuf.buf, kbuf.len); std::string vs((const char*)vbuf.buf, vbuf.len); #else const char *key, *val; Py_ssize_t klen, vlen; if (!PyArg_ParseTuple(args, "y#y#|i", &key, &klen, &val, &vlen, &part)) { return NULL; } std::string ks(key, klen); std::string vs(val, vlen); #endif state = PyEval_SaveThread(); try { if (part >= 0) { HadoopUtils::serializeInt(PARTITIONED_OUTPUT, *self->stream); HadoopUtils::serializeInt(part, *self->stream); } else { HadoopUtils::serializeInt(OUTPUT, *self->stream); } HadoopUtils::serializeString(ks, *self->stream); HadoopUtils::serializeString(vs, *self->stream); } catch (HadoopUtils::Error e) { PyEval_RestoreThread(state); PyErr_SetString(PyExc_IOError, e.getMessage().c_str()); #if PY_MAJOR_VERSION < 3 PyBuffer_Release(&kbuf); PyBuffer_Release(&vbuf); #endif return NULL; } PyEval_RestoreThread(state); #if PY_MAJOR_VERSION < 3 PyBuffer_Release(&kbuf); PyBuffer_Release(&vbuf); #endif Py_RETURN_NONE; } static PyObject * FileOutStream_advance(FileOutStreamObj *self, PyObject *args) { size_t len; PyThreadState *state; _ASSERT_STREAM_OPEN; if (!PyArg_ParseTuple(args, "n", &len)) { return NULL; } state = PyEval_SaveThread(); bool res = self->stream->advance(len); if (!res) { PyEval_RestoreThread(state); return PyErr_SetFromErrno(PyExc_IOError); } PyEval_RestoreThread(state); Py_RETURN_NONE; } static PyObject * FileOutStream_flush(FileOutStreamObj *self) { PyThreadState *state; _ASSERT_STREAM_OPEN; state = PyEval_SaveThread(); self->stream->flush(); PyEval_RestoreThread(state); Py_RETURN_NONE; } static PyMethodDef FileOutStream_methods[] = { {"close", (PyCFunction)FileOutStream_close, METH_NOARGS, "close(): close the currently open file"}, {"write", (PyCFunction)FileOutStream_write, METH_VARARGS, "write(data): write data to the stream"}, {"write_vint", (PyCFunction)FileOutStream_writeVInt, METH_VARARGS, "write_vint(n): write a variable length integer to the stream"}, {"write_vlong", (PyCFunction)FileOutStream_writeVLong, METH_VARARGS, "write_vlong(n): write a variable length long integer to the stream"}, {"write_float", (PyCFunction)FileOutStream_writeFloat, METH_VARARGS, "write_float(n): write a float to the stream"}, {"write_string", (PyCFunction)FileOutStream_writeString, METH_VARARGS, "write_string(n): write a string to the stream"}, {"write_bytes", (PyCFunction)FileOutStream_writeBytes, METH_VARARGS, "write_bytes(n): write a bytes object to the stream"}, {"write_tuple", (PyCFunction)FileOutStream_writeTuple, METH_VARARGS, "write_tuple(fmt, t): write values from iterable t according to fmt"}, {"write_output", (PyCFunction)FileOutStream_writeOutput, METH_VARARGS, "write_output(key, value[, part]): write pipes [partitioned] output"}, {"advance", (PyCFunction)FileOutStream_advance, METH_VARARGS, "advance(len): advance len bytes"}, {"flush", (PyCFunction)FileOutStream_flush, METH_NOARGS, "flush(): flush the stream"}, {"__enter__", (PyCFunction)FileOutStream_enter, METH_NOARGS}, {"__exit__", (PyCFunction)FileOutStream_exit, 
METH_VARARGS}, {NULL} /* Sentinel */ }; PyTypeObject FileOutStreamType = { PyVarObject_HEAD_INIT(NULL, 0) "sercore.FileOutStream", /* tp_name */ sizeof(FileOutStreamObj), /* tp_basicsize */ 0, /* tp_itemsize */ 0, /* tp_dealloc */ 0, /* tp_print */ 0, /* tp_getattr */ 0, /* tp_setattr */ 0, /* tp_compare */ 0, /* tp_repr */ 0, /* tp_as_number */ 0, /* tp_as_sequence */ 0, /* tp_as_mapping */ 0, /* tp_hash */ 0, /* tp_call */ 0, /* tp_str */ 0, /* tp_getattro */ 0, /* tp_setattro */ 0, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ "A class to write a stream to a file", /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ 0, /* tp_weaklistoffset */ 0, /* tp_iter */ 0, /* tp_iternext */ FileOutStream_methods, /* tp_methods */ 0, /* tp_members */ 0, /* tp_getset */ 0, /* tp_base */ 0, /* tp_dict */ 0, /* tp_descr_get */ 0, /* tp_descr_set */ 0, /* tp_dictoffset */ (initproc)FileOutStream_init, /* tp_init */ 0, /* tp_alloc */ 0, /* tp_new */ }; ================================================ FILE: src/sercore/streams.h ================================================ // BEGIN_COPYRIGHT // // Copyright 2009-2026 CRS4. // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy // of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the // License for the specific language governing permissions and limitations // under the License. // // END_COPYRIGHT #pragma once #include <Python.h> #include <stdio.h> #include <memory> #include "HadoopUtils/SerialUtils.hh" typedef struct { PyObject_HEAD FILE *fp; bool closed; std::shared_ptr<HadoopUtils::FileInStream> stream; } FileInStreamObj; typedef struct { PyObject_HEAD FILE *fp; bool closed; std::shared_ptr<HadoopUtils::FileOutStream> stream; } FileOutStreamObj; extern PyTypeObject FileInStreamType; extern PyTypeObject FileOutStreamType; ================================================ FILE: test/__init__.py ================================================ ================================================ FILE: test/all_tests.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License.
# # END_COPYRIGHT import unittest import os import sys import importlib _TEST_DIRS = ( "app", "common", "mapreduce", "hdfs", # run these last, in case HDFS needs time to be fully up ) def suite(): suites = [] for dir_ in _TEST_DIRS: module = importlib.import_module("%s.%s" % (dir_, "all_tests")) sys.path.insert(0, dir_) path = [os.path.abspath("./%s" % dir_)] suites.append(getattr(module, "suite")(path)) sys.path.pop(0) return unittest.TestSuite(tuple(suites)) if __name__ == '__main__': _RESULT = unittest.TextTestRunner(verbosity=2).run(suite()) sys.exit(not _RESULT.wasSuccessful()) ================================================ FILE: test/app/__init__.py ================================================ ================================================ FILE: test/app/all_tests.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT import unittest from pydoop.test_utils import get_module TEST_MODULE_NAMES = [ 'test_submit', ] def suite(path=None): suites = [] for module in TEST_MODULE_NAMES: suites.append(get_module(module, path).suite()) return unittest.TestSuite(suites) if __name__ == '__main__': import sys _RESULT = unittest.TextTestRunner(verbosity=2).run(suite()) sys.exit(not _RESULT.wasSuccessful()) ================================================ FILE: test/app/test_submit.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT import unittest import shutil import tempfile import os import re import sys from io import StringIO, BytesIO import pydoop.app.main as app from pydoop.app.submit import PydoopSubmitter def nop(x=None): pass class Args(object): def __init__(self, **kwargs): for k, v in kwargs.items(): setattr(self, k, v) def __getattr__(self, _): """ If we don't have the requested attribute return None. """ return None class TestAppSubmit(unittest.TestCase): def setUp(self): self.submitter = PydoopSubmitter() @staticmethod def _gen_default_args(): return Args( entry_point='__main__', log_level='INFO', module='the_module', no_override_env=False, no_override_home=False, python_program='python', input="input_path", output="output_path", job_name="job_name", num_reducers=0, ) def test_help(self): parser = app.make_parser() # silence!
for k in ['submit', 'script']: parser._actions[2].choices[k].format_help = nop parser._actions[2].choices[k].format_usage = nop parser._actions[2].choices[k].error = nop parser.format_help = nop parser.format_usage = nop parser.error = nop try: args, unk = parser.parse_known_args(['-h']) except SystemExit as e: self.assertEqual(e.args[0], 0) try: args, unk = parser.parse_known_args(['submit', '-h']) except SystemExit as e: self.assertEqual(e.args[0], 0) try: args, unk = parser.parse_known_args(['submit']) except SystemExit as e: self.assertEqual(e.args[0], 2) def _check_args(self, args, args_kv): for k, v in args_kv: k = re.sub("^--", "", k).replace('-', '_') self.assertTrue(hasattr(args, k)) v1 = getattr(args, k) if v is None: self.assertEqual(v1, True) elif type(v1) is list: pass else: self.assertEqual(v1, v) def test_conf_file(self): wd = tempfile.mkdtemp(prefix='pydoop_') conf_file = os.path.join(wd, 'pydoop.conf') args_kv = (("--pretend", None), ("--input-format", 'mapreduce.lib.input.TextInputFormat'), ("--output-format", 'mapreduce.lib.input.TextOutputFormat'), ("--num-reducers", 10), ) try: with open(conf_file, 'w') as cf: d = ''.join(['{}\n{}\n'.format(k, v) if v is not None else '{}\n'.format(k) for (k, v) in args_kv]) cf.write(d) parser = app.make_parser() parser.format_help = nop module = 'mymod1.mod2.mod3' ainput = 'input' aoutput = 'output' argv = ['submit', module, ainput, aoutput, '@' + conf_file] [args, unknown] = parser.parse_known_args(argv) self.assertEqual(args.module, module) self.assertEqual(args.input, ainput) self.assertEqual(args.output, aoutput) self.assertEqual(len(unknown), 0) self._check_args(args, args_kv) finally: shutil.rmtree(wd) def test_empty_param(self): parser = app.make_parser() parser.format_help = nop program = 'program' ainput = 'input' aoutput = 'output' argv = ['submit', '--module', '', program, ainput, aoutput] [args, unknown] = parser.parse_known_args(argv) self.assertEqual(args.module, '') def test_generate_pipes_code_env(self): args = self._gen_default_args() self.submitter.set_args(args) old_ld_lib_path = os.environ.get('LD_LIBRARY_PATH', '') try: # we set this variable for this test since it may not be set in # the environment os.environ['LD_LIBRARY_PATH'] = '/test_path' code = self.submitter._generate_pipes_code() self.assertTrue('export PATH=' in code) self.assertTrue('export PYTHONPATH=' in code) self.assertTrue('export LD_LIBRARY_PATH="/test_path"' in code) finally: os.environ['LD_LIBRARY_PATH'] = old_ld_lib_path def test_generate_pipes_code_no_override_ld_path(self): args = self._gen_default_args() args.no_override_ld_path = True self.submitter.set_args(args) old_ld_lib_path = os.environ.get('LD_LIBRARY_PATH', '') try: os.environ['LD_LIBRARY_PATH'] = '/test_path' code = self.submitter._generate_pipes_code() self.assertTrue('export PYTHONPATH=' in code) self.assertFalse('export LD_LIBRARY_PATH=' in code) finally: os.environ['LD_LIBRARY_PATH'] = old_ld_lib_path def test_generate_pipes_code_no_override_path(self): args = self._gen_default_args() args.no_override_path = True self.submitter.set_args(args) code = self.submitter._generate_pipes_code() self.assertTrue('export PYTHONPATH=' in code) self.assertFalse('export PATH=' in code) def test_generate_pipes_code_no_override_pythonpath(self): args = self._gen_default_args() args.no_override_pypath = True self.submitter.set_args(args) code = self.submitter._generate_pipes_code() self.assertTrue('export PYTHONPATH="${PWD}:${PYTHONPATH}"' in code) self.assertTrue('export PATH=' in code) def 
test_generate_pipes_code_with_set_env(self): args = self._gen_default_args() args.set_env = ["PATH=/my/custom/path"] self.submitter.set_args(args) old_ld_lib_path = os.environ.get('LD_LIBRARY_PATH', '') try: os.environ['LD_LIBRARY_PATH'] = '/test_path' code = self.submitter._generate_pipes_code() self.assertTrue('export PATH="/my/custom/path"' in code) self.assertTrue('export PYTHONPATH=' in code) self.assertTrue('export LD_LIBRARY_PATH="/test_path"' in code) finally: os.environ['LD_LIBRARY_PATH'] = old_ld_lib_path def test_generate_code_no_env_override(self): args = self._gen_default_args() args.no_override_env = True self.submitter.set_args(args) code = self.submitter._generate_pipes_code() self.assertFalse('export PATH=' in code) self.assertFalse('export LD_LIBRARY_PATH="/test_path"' in code) # PYTHONPATH should still be there because we add the hadoop # working directory self.assertTrue('export PYTHONPATH=' in code) def test_generate_code_no_env_override_with_set_env(self): args = self._gen_default_args() args.no_override_env = True args.set_env = ["PATH=/my/custom/path"] self.submitter.set_args(args) code = self.submitter._generate_pipes_code() self.assertTrue('export PATH="/my/custom/path"' in code) self.assertFalse('export LD_LIBRARY_PATH="/test_path"' in code) # PYTHONPATH should still be there because we add the hadoop # working directory self.assertTrue('export PYTHONPATH=' in code) def test_env_arg_to_dict(self): env_arg = ['var1=value1', ' var2 = value2 ', 'var3 = str with = sign'] d = self.submitter._env_arg_to_dict(env_arg) self.assertEqual('value1', d['var1']) self.assertEqual('value2', d['var2']) self.assertEqual('str with = sign', d['var3']) def test_bad_upload_files(self): args = self._gen_default_args() args.python_zip = [""] self.assertRaises(Exception, self.submitter.set_args, args) def test_pretend(self): args = self._gen_default_args() args.pretend = True args.log_level = "CRITICAL" stdout = sys.stdout sys.stdout = StringIO() if sys.version_info >= (3,) else BytesIO() self.submitter.set_args(args) self.submitter.run() captured = sys.stdout.getvalue() sys.stdout = stdout self.assertGreaterEqual(len(captured), 0) def suite(): suite_ = unittest.TestLoader().loadTestsFromTestCase(TestAppSubmit) return suite_ if __name__ == '__main__': _RUNNER = unittest.TextTestRunner(verbosity=2) _RUNNER.run((suite())) ================================================ FILE: test/avro/all_tests.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License.
# # END_COPYRIGHT import unittest from pydoop.test_utils import get_module TEST_MODULE_NAMES = [ 'test_io', ] def suite(path=None): suites = [] for module in TEST_MODULE_NAMES: suites.append(get_module(module, path).suite()) return unittest.TestSuite(suites) if __name__ == '__main__': import sys _RESULT = unittest.TextTestRunner(verbosity=2).run(suite()) sys.exit(not _RESULT.wasSuccessful()) ================================================ FILE: test/avro/common.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT from pydoop.utils.py3compat import StringIO from avro.io import DatumWriter, BinaryEncoder class AvroSerializer(object): def __init__(self, schema): self.schema = schema self.datum_writer = DatumWriter(schema) def serialize(self, record): f = StringIO() encoder = BinaryEncoder(f) self.datum_writer.write(record, encoder) return f.getvalue() def avro_user_record(i): return { "office": 'office-%s' % i, "favorite_number": i, "favorite_color": 'color-%s' % i, "name": 'name-%s' % i, } ================================================ FILE: test/avro/test_io.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. 
# # END_COPYRIGHT import os import unittest import itertools as it import avro.datafile as avdf from avro.io import DatumReader, DatumWriter from pydoop.mapreduce.api import FileSplit from pydoop.avrolib import ( SeekableDataFileReader, AvroReader, AvroWriter, parse ) from pydoop.test_utils import WDTestCase from pydoop.utils.py3compat import czip, cmap import pydoop.hdfs as hdfs from common import avro_user_record THIS_DIR = os.path.dirname(os.path.abspath(__file__)) class TestAvroIO(WDTestCase): def setUp(self): super(TestAvroIO, self).setUp() with open(os.path.join(THIS_DIR, "user.avsc")) as f: self.schema = parse(f.read()) def write_avro_file(self, rec_creator, n_samples, sync_interval): avdf.SYNC_INTERVAL = sync_interval self.assertEqual(avdf.SYNC_INTERVAL, sync_interval) fo = self._mkf('data.avro', mode='wb') with avdf.DataFileWriter(fo, DatumWriter(), self.schema) as writer: for i in range(n_samples): writer.append(rec_creator(i)) return fo.name def test_seekable(self): fn = self.write_avro_file(avro_user_record, 500, 1024) with open(fn, 'rb') as f: sreader = SeekableDataFileReader(f, DatumReader()) res = [t for t in czip(cmap( lambda _: f.tell(), it.repeat(1) ), sreader)] sreader.align_after(res[-1][0]) with self.assertRaises(StopIteration): r = next(sreader) sreader.align_after(0) r = next(sreader) self.assertEqual(r, res[0][1]) def offset_iterator(): s = -1 for o, r in res: sreader.align_after(o) t = f.tell() if t == s: continue s = t try: x = next(sreader) except StopIteration: return yield (t, x) i = 0 for xo, x in offset_iterator(): sreader.align_after(xo) for o, r in res[i:]: if o >= xo: self.assertEqual(x, r) break i += 1 def test_avro_reader(self): N = 500 fn = self.write_avro_file(avro_user_record, N, 1024) url = hdfs.path.abspath(fn, local=True) class FunkyCtx(object): def __init__(self, isplit): self.input_split = isplit def get_areader(offset, length): isplit = FileSplit(url, offset, length) ctx = FunkyCtx(isplit) return AvroReader(ctx) areader = get_areader(0, 14) file_length = areader.reader.file_length with self.assertRaises(StopIteration): next(areader) areader = get_areader(0, file_length) with SeekableDataFileReader(open(fn, 'rb'), DatumReader()) as sreader: for (o, a), s in czip(areader, sreader): self.assertEqual(a, s) mid_len = int(file_length / 2) lows = [x for x in get_areader(0, mid_len)] highs = [x for x in get_areader(mid_len, file_length)] self.assertEqual(N, len(lows) + len(highs)) def test_avro_writer(self): class FunkyCtx(object): def __init__(self_, job_conf): self_.job_conf = job_conf class AWriter(AvroWriter): schema = self.schema def emit(self_, key, value): self_.writer.append(key) ctx = FunkyCtx({ 'mapreduce.task.partition': 1, 'mapreduce.task.output.dir': hdfs.path.abspath(self.wd, local=True) }) awriter = AWriter(ctx) N = 10 for i in range(N): awriter.emit(avro_user_record(i), '') awriter.close() def suite(): suite_ = unittest.TestSuite() suite_.addTest(TestAvroIO('test_seekable')) suite_.addTest(TestAvroIO('test_avro_reader')) suite_.addTest(TestAvroIO('test_avro_writer')) return suite_ if __name__ == '__main__': _RUNNER = unittest.TextTestRunner(verbosity=2) _RUNNER.run((suite())) ================================================ FILE: test/avro/user.avsc ================================================ { "namespace": "example.avro", "type": "record", "name": "User", "fields": [ {"name": "office", "type": "string"}, {"name": "name", "type": "string"}, {"name": "favorite_number", "type": ["int", "null"]}, {"name": "favorite_color", "type": 
["string", "null"]} ] } ================================================ FILE: test/common/__init__.py ================================================ ================================================ FILE: test/common/all_tests.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT import sys import unittest from pydoop.test_utils import get_module TEST_MODULE_NAMES = [ 'test_hadoop_utils', 'test_hadut', 'test_pydoop', ] def suite(path=None): suites = [] for module in TEST_MODULE_NAMES: suites.append(get_module(module, path).suite()) return unittest.TestSuite(suites) def main(): result = unittest.TextTestRunner(verbosity=2).run(suite()) sys.exit(not result.wasSuccessful()) if __name__ == '__main__': main() ================================================ FILE: test/common/test_hadoop_utils.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. 
# # END_COPYRIGHT import unittest import tempfile import os import shutil from xml.dom.minidom import getDOMImplementation DOM_IMPL = getDOMImplementation() import pydoop.hadoop_utils as hu class TestHadoopUtils(unittest.TestCase): def setUp(self): self.hadoop_home = tempfile.mkdtemp(prefix="pydoop_test_") self.hadoop_conf = os.path.join(self.hadoop_home, "conf") os.mkdir(self.hadoop_conf) self.orig_env = os.environ.copy() os.environ["HADOOP_CONF_DIR"] = self.hadoop_conf self.pf = hu.PathFinder() def tearDown(self): os.environ.clear() os.environ.update(self.orig_env) shutil.rmtree(self.hadoop_home) def test_get_hadoop_params(self): self.__check_params() self.__check_params('', {}) self.__check_params('', {}) doc = DOM_IMPL.createDocument(None, "configuration", None) self.__check_params(doc.toxml(), {}) root = doc.documentElement prop = root.appendChild(doc.createElement("property")) self.__check_params(doc.toxml(), {}) for s in "name", "value": n = prop.appendChild(doc.createElement(s)) n.appendChild(doc.createTextNode(s.upper())) self.__check_params(doc.toxml(), {"NAME": "VALUE"}) def __check_params(self, xml_content=None, expected=None): if expected is None: expected = {} xml_fn = os.path.join(self.hadoop_conf, "core-site.xml") if os.path.exists(xml_fn): os.remove(xml_fn) if xml_content is not None: with open(xml_fn, "w") as fo: fo.write(xml_content) params = self.pf.hadoop_params() self.assertEqual(params, expected) def suite(): suite = unittest.TestSuite() suite.addTest(TestHadoopUtils('test_get_hadoop_params')) return suite if __name__ == '__main__': runner = unittest.TextTestRunner(verbosity=2) runner.run((suite())) ================================================ FILE: test/common/test_hadut.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. 
# # END_COPYRIGHT # pylint: disable=W0212 """ Test suite for pydoop.hadut """ import subprocess import unittest import pydoop.hadut as hadut def pair_set(seq): return set((seq[i], seq[i + 1]) for i in range(0, len(seq), 2)) class TestHadut(unittest.TestCase): CHECKNATIVE_OUT, _ = subprocess.Popen( ["hadoop", "checknative"], universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE ).communicate() def assertEqualPairSet(self, seq1, seq2): return self.assertEqual(pair_set(seq1), pair_set(seq2)) def test_pop_generic_args(self): self.assertRaises(ValueError, hadut._pop_generic_args, ['-fs']) args = [ '-input', 'i', '-libjars', 'l', '-fs', 'f', '-output', 'o', '-jar', 'pippo' ] gargs = hadut._pop_generic_args(args) self.assertEqualPairSet(gargs, ['-libjars', 'l', '-fs', 'f']) self.assertEqualPairSet( args, ['-input', 'i', '-output', 'o', '-jar', 'pippo'] ) def test_merge_csv_args(self): self.assertRaises(ValueError, hadut._merge_csv_args, ['-archives']) args = [ '-libjars', 'l1', '-fs', 'f', '-libjars', 'l2', '-files', 'pippo', ] hadut._merge_csv_args(args) try: self.assertEqualPairSet( args, ['-libjars', 'l1,l2', '-fs', 'f', '-files', 'pippo'] ) except AssertionError: self.assertEqualPairSet( args, ['-libjars', 'l2,l1', '-fs', 'f', '-files', 'pippo'] ) def test_cmd(self): out = hadut.run_cmd("checknative", keep_streams=True) self.assertEqual(out, self.CHECKNATIVE_OUT) def test_run_class(self): out = hadut.run_class( "org.apache.hadoop.util.NativeLibraryChecker", keep_streams=True ) self.assertEqual(out, self.CHECKNATIVE_OUT) def suite(): suite_ = unittest.TestSuite() suite_.addTest(TestHadut('test_pop_generic_args')) suite_.addTest(TestHadut('test_merge_csv_args')) suite_.addTest(TestHadut('test_cmd')) suite_.addTest(TestHadut('test_run_class')) return suite_ if __name__ == '__main__': _RUNNER = unittest.TextTestRunner(verbosity=2) _RUNNER.run((suite())) ================================================ FILE: test/common/test_pydoop.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT """ Test suite for top-level functions. 
""" import unittest import os import tempfile import shutil from imp import reload import pydoop class TestPydoop(unittest.TestCase): def setUp(self): self.wd = tempfile.mkdtemp(prefix='pydoop_test_') self.old_vars = { 'HADOOP_HOME': os.getenv('HADOOP_HOME'), 'HADOOP_CONF_DIR': os.getenv('HADOOP_CONF_DIR'), } def tearDown(self): for k, v in self.old_vars.items(): if v: os.environ[k] = v else: os.environ.pop(k, None) reload(pydoop) shutil.rmtree(self.wd) def test_home(self): old_home = pydoop.hadoop_home() if os.path.isdir(old_home): new_home = os.path.join(self.wd, 'hadoop') os.symlink(old_home, new_home) os.environ['HADOOP_HOME'] = new_home reload(pydoop) self.assertEqual(pydoop.hadoop_home(), new_home) def test_conf(self): old_conf = pydoop.hadoop_conf() new_conf = os.path.join(self.wd, "conf") shutil.copytree(old_conf, new_conf) os.environ['HADOOP_CONF_DIR'] = new_conf reload(pydoop) self.assertEqual(pydoop.hadoop_conf(), new_conf) def test_pydoop_jar_path(self): jar_path = pydoop.jar_path() if jar_path is not None: self.assertTrue(os.path.exists(jar_path)) directory, filename = os.path.split(jar_path) self.assertEqual(filename, pydoop.jar_name()) self.assertEqual('pydoop', os.path.basename(directory)) def suite(): suite_ = unittest.TestSuite() suite_.addTest(TestPydoop('test_home')) suite_.addTest(TestPydoop('test_conf')) suite_.addTest(TestPydoop('test_pydoop_jar_path')) return suite_ if __name__ == '__main__': _RUNNER = unittest.TextTestRunner(verbosity=2) _RUNNER.run((suite())) ================================================ FILE: test/common/test_test_support.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. 
# # END_COPYRIGHT """\ Test suite for pydoop.test_support """ import unittest import uuid import os import pydoop.test_support as pts TARGET_CODE = """\ #!/foo/bar/python from __future__ import print_function from future import whatever import foobar print("Hello, world") """ class TestTestSupport(unittest.TestCase): def test_inject_code(self): lines = TARGET_CODE.splitlines() new_code = uuid.uuid4().hex ret = pts.inject_code(new_code, TARGET_CODE) ret_lines = ret.splitlines() self.assertEqual(ret_lines[:3], lines[:3]) self.assertTrue(new_code in ret_lines[3:-4]) self.assertEqual(ret_lines[-4:], lines[-4:]) def test_set_python_cmd(self): cmd = "/usr/bin/python3" ret = pts.set_python_cmd(TARGET_CODE, cmd) self.assertEqual(ret.split(os.linesep, 1)[0], "#!%s" % cmd) def suite(): suite_ = unittest.TestSuite() suite_.addTest(TestTestSupport('test_inject_code')) suite_.addTest(TestTestSupport('test_set_python_cmd')) return suite_ if __name__ == '__main__': _RUNNER = unittest.TextTestRunner(verbosity=2) _RUNNER.run((suite())) ================================================ FILE: test/hdfs/__init__.py ================================================ ================================================ FILE: test/hdfs/all_tests.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT import unittest from pydoop.test_utils import get_module TEST_MODULE_NAMES = [ 'test_core', 'test_local_fs', 'test_hdfs_fs', 'test_path', 'test_hdfs', ] def suite(path=None): suites = [] for module in TEST_MODULE_NAMES: suites.append(get_module(module, path).suite()) return unittest.TestSuite(suites) if __name__ == '__main__': import sys _RESULT = unittest.TextTestRunner(verbosity=2).run(suite()) sys.exit(not _RESULT.wasSuccessful()) ================================================ FILE: test/hdfs/common_hdfs_tests.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. 
# # END_COPYRIGHT import sys import os import unittest import uuid import shutil import operator import array from ctypes import create_string_buffer import pydoop.hdfs as hdfs import pydoop.test_utils as utils from pydoop.utils.py3compat import _is_py3 class TestCommon(unittest.TestCase): def __init__(self, target, hdfs_host='', hdfs_port=0): unittest.TestCase.__init__(self, target) self.hdfs_host = hdfs_host self.hdfs_port = hdfs_port def setUp(self): self.fs = hdfs.hdfs(self.hdfs_host, self.hdfs_port) self.wd = utils.make_wd(self.fs) def tearDown(self): self.fs.delete(self.wd) self.fs.close() def _make_random_path(self, where=None, add_uni=True): rval = "%s/%s" % (where or self.wd, uuid.uuid4().hex) if add_uni: rval = "%s_%s" % (rval, utils.UNI_CHR) return rval # also an implicit test for the create_directory method def _make_random_dir(self, where=None, add_uni=True): path = self._make_random_path(where=where, add_uni=add_uni) self.fs.create_directory(path) self.assertTrue(self.fs.exists(path)) return path # also an implicit test for the write method def _make_random_file(self, where=None, content=None, **kwargs): kwargs["mode"] = "w" content = content or utils.make_random_data(printable=True) path = self._make_random_path(where=where) with self.fs.open_file(path, **kwargs) as fo: i = 0 bytes_written = 0 bufsize = 24 * 1024 * 1024 while i < len(content): bytes_written += fo.write(content[i: i + bufsize]) i += bufsize self.assertEqual(bytes_written, len(content)) return path def failUnlessRaisesExternal(self, excClass, callableObj, *args, **kwargs): utils.silent_call( self.failUnlessRaises, excClass, callableObj, *args, **kwargs ) assertRaisesExternal = failUnlessRaisesExternal def assertEqualPathInfo(self, info1, info2, tolerance=10): """\ Check path info results for equality. Since ``last_access`` values are timestamps in seconds, we need to tolerate a small difference between them. In practice, unless the expected value is stored way in advance, this difference should be 0 or 1 second. 
""" self.assertEqual(info1.keys(), info2.keys()) for (k, v1) in info1.items(): v2 = info2[k] if k == "last_access": self.assertLessEqual(abs(v2 - v1), tolerance) else: self.assertEqual(v1, v2) def open_close(self): path = self._make_random_path() self.fs.open_file(path, "w").close() with self.fs.open_file(path, "r") as f: self.assertFalse(f.closed) self.assertTrue(f.closed) self.assertRaises(ValueError, f.read) path = self._make_random_path() self.assertRaisesExternal(IOError, self.fs.open_file, path, "r") self.assertRaises(ValueError, self.fs.open_file, "") def delete(self): parent = self._make_random_dir() path = self._make_random_dir(where=parent) for i in False, True: fn = self._make_random_file(where=path) self.fs.delete(fn, recursive=i) self.assertFalse(self.fs.exists(fn)) self._make_random_file(where=path) self.fs.delete(path, recursive=True) self.assertFalse(self.fs.exists(path)) self.fs.delete(parent, recursive=False) self.assertFalse(self.fs.exists(parent)) self.assertRaises(ValueError, self.fs.delete, "") def copy(self): local_fs = hdfs.hdfs('', 0) local_wd = utils.make_wd(local_fs) from_path = os.path.join(local_wd, uuid.uuid4().hex) content = uuid.uuid4().bytes with open(from_path, "wb") as f: f.write(content) to_path = self._make_random_file() local_fs.copy(from_path, self.fs, to_path) self.assertRaises(ValueError, local_fs.copy, "", self.fs, "") local_fs.close() with self.fs.open_file(to_path) as f: self.assertEqual(f.read(), content) shutil.rmtree(local_wd) def move(self): content = utils.make_random_data(printable=True) from_path = self._make_random_file(content=content) to_path = self._make_random_path() self.fs.move(from_path, self.fs, to_path) self.assertFalse(self.fs.exists(from_path)) with self.fs.open_file(to_path) as f: self.assertEqual(f.read(), content) self.assertRaises(ValueError, self.fs.move, "", self.fs, "") def chmod(self): new_perm = 0o777 path = self._make_random_dir() old_perm = self.fs.get_path_info(path)["permissions"] assert old_perm != new_perm self.fs.chmod(path, new_perm) self.assertEqual(self.fs.get_path_info(path)["permissions"], new_perm) self.fs.chmod(path, old_perm) self.assertEqual(self.fs.get_path_info(path)["permissions"], old_perm) self.assertRaises(ValueError, self.fs.chmod, "", new_perm) def __set_and_check_perm(self, path, new_mode, expected_mode): self.fs.chmod(path, new_mode) perm = self.fs.get_path_info(path)["permissions"] self.assertEqual(expected_mode, perm) def chmod_w_string(self): path = self._make_random_dir() self.fs.chmod(path, 0o500) # each user self.__set_and_check_perm(path, "u+w", 0o700) self.__set_and_check_perm(path, "g+w", 0o720) self.__set_and_check_perm(path, "o+w", 0o722) # each permission mode self.__set_and_check_perm(path, "o+r", 0o726) self.__set_and_check_perm(path, "o+x", 0o727) # subtract operation, and multiple permission modes self.__set_and_check_perm(path, "o-rwx", 0o720) # multiple users self.__set_and_check_perm(path, "ugo-rwx", 0o000) # 'a' user self.__set_and_check_perm(path, "a+r", 0o444) # blank user -- should respect the user's umask umask = os.umask(0o007) self.fs.chmod(path, "+w") perm = self.fs.get_path_info(path)["permissions"] os.umask(umask) self.assertEqual(0o664, perm) # assignment op self.__set_and_check_perm(path, "a=rwx", 0o777) def file_attrs(self): path = self._make_random_path() content = utils.make_random_data() for mode in "wb", "wt": with self.fs.open_file(path, mode) as f: self.assertTrue(f.name.endswith(path)) self.assertTrue(f.fs is self.fs) self.assertEqual(f.size, 0) 
self.assertEqual(f.mode, mode) self.assertTrue(f.writable()) f.write(content if mode == "wb" else content.decode("utf-8")) self.assertEqual(f.size, len(content)) for mode in "rb", "rt": with self.fs.open_file(path, mode) as f: self.assertTrue(f.name.endswith(path)) self.assertTrue(f.fs is self.fs) self.assertEqual(f.size, len(content)) self.assertEqual(f.mode, mode) self.assertFalse(f.writable()) def flush(self): path = self._make_random_path() with self.fs.open_file(path, "w") as f: f.write(utils.make_random_data()) f.flush() def available(self): content = utils.make_random_data() path = self._make_random_file(content=content) with self.fs.open_file(path) as f: self.assertEqual(len(content), f.available()) def get_path_info(self): content = utils.make_random_data() path = self._make_random_file(content=content) info = self.fs.get_path_info(path) self.__check_path_info(info, kind="file", size=len(content)) self.assertTrue(info['name'].endswith(path)) path = self._make_random_dir() info = self.fs.get_path_info(path) self.__check_path_info(info, kind="directory") self.assertTrue(info['name'].endswith(path)) self.assertRaises( IOError, self.fs.get_path_info, self._make_random_path() ) self.assertRaises(ValueError, self.fs.get_path_info, "") def read(self): content = utils.make_random_data() path = self._make_random_file(content=content) with self.fs.open_file(path) as f: self.assertEqual(f.read(), content) with self.fs.open_file(path) as f: self.assertEqual(f.read(-1), content) with self.fs.open_file(path) as f: self.assertEqual(f.read(3), content[:3]) self.assertEqual(f.read(3), content[3:6]) if not _is_py3 and not self.fs.host: self.assertRaises(ValueError, f.write, content) else: self.assertRaises(IOError, f.write, content) def __read_chunk(self, chunk_factory): content = utils.make_random_data() path = self._make_random_file(content=content) size = len(content) for chunk_size in size - 1, size, size + 1: with self.fs.open_file(path) as f: chunk = chunk_factory(chunk_size) bytes_read = f.read_chunk(chunk) self.assertEqual(bytes_read, min(size, chunk_size)) self.assertEqual(bytes(bytearray(chunk))[:bytes_read], content[:bytes_read]) def read_chunk(self): def array_by_len(length): return array.array("b", b"\x00" * length) for factory in bytearray, create_string_buffer, array_by_len: self.__read_chunk(factory) def write(self): content = utils.make_random_data() path = self._make_random_path() with self.fs.open_file(path, "w") as fo: bytes_written = fo.write(content) self.assertEqual(bytes_written, len(content)) with self.fs.open_file(path) as fo: self.assertEqual(content, fo.read()) with self.fs.open_file(path, "w") as fo: bytes_written = fo.write(bytearray(content)) self.assertEqual(bytes_written, len(content)) with self.fs.open_file(path) as fo: self.assertEqual(content, fo.read()) chunk = create_string_buffer(content, len(content)) with self.fs.open_file(path, "w") as fo: bytes_written = fo.write(chunk) self.assertEqual(bytes_written, len(content)) def append(self): replication = 1 # see https://issues.apache.org/jira/browse/HDFS-3091 content, update = utils.make_random_data(), utils.make_random_data() path = self._make_random_path() with self.fs.open_file(path, "w", replication=replication) as fo: fo.write(content) try: with utils.silent_call(self.fs.open_file, path, "a") as fo: fo.write(update) except IOError: sys.stderr.write("NOT SUPPORTED ... 
") return else: with self.fs.open_file(path) as fi: self.assertEqual(fi.read(), content + update) def tell(self): offset = 3 path = self._make_random_file() with self.fs.open_file(path) as f: f.read(offset) self.assertEqual(f.tell(), offset) def pread(self): content = utils.make_random_data() offset, length = 2, 3 path = self._make_random_file(content=content) with self.fs.open_file(path) as f: self.assertEqual( f.pread(offset, length), content[offset: offset + length] ) self.assertEqual(f.tell(), 0) self.assertEqual(content[1:], f.pread(1, -1)) self.assertRaises(IOError, f.pread, -1, 10) # read starting past end of file self.assertRaises(IOError, f.pread, len(content) + 1, 10) # read past end of file buf = f.pread(len(content) - 2, 10) self.assertEqual(2, len(buf)) def pread_chunk(self): content = utils.make_random_data() offset, length = 2, 3 chunk = create_string_buffer(length) path = self._make_random_file(content=content) with self.fs.open_file(path) as f: bytes_read = f.pread_chunk(offset, chunk) self.assertEqual(bytes_read, length) self.assertEqual(chunk.value, content[offset: offset + length]) self.assertEqual(f.tell(), 0) def copy_on_self(self): content = utils.make_random_data() path = self._make_random_file(content=content) path1 = self._make_random_path() self.fs.copy(path, self.fs, path1) with self.fs.open_file(path1) as f: self.assertEqual(f.read(), content) def rename(self): old_path = self._make_random_file() new_path = self._make_random_path() self.fs.rename(old_path, new_path) self.assertTrue(self.fs.exists(new_path)) self.assertFalse(self.fs.exists(old_path)) self.assertRaises(ValueError, self.fs.rename, old_path, "") self.assertRaises(ValueError, self.fs.rename, "", new_path) def change_dir(self): cwd = self.fs.working_directory() new_d = self._make_random_path() # does not need to exist self.fs.set_working_directory(new_d) self.assertEqual(self.fs.working_directory(), new_d) self.fs.set_working_directory(cwd) self.assertEqual(self.fs.working_directory(), cwd) self.assertRaises(ValueError, self.fs.set_working_directory, "") def list_directory(self): new_d = self._make_random_dir() self.assertEqual(self.fs.list_directory(new_d), []) paths = [self._make_random_file(where=new_d) for _ in range(3)] paths.sort(key=os.path.basename) infos = self.fs.list_directory(new_d) infos.sort(key=lambda p: os.path.basename(p["name"])) self.assertEqual(len(infos), len(paths)) for i, p in zip(infos, paths): self.__check_path_info(i, kind="file") self.assertTrue(i['name'].endswith(p)) self.assertRaises( IOError, self.fs.list_directory, self._make_random_path() ) self.assertRaises(ValueError, self.fs.list_directory, "") def __check_readline(self, get_lines): samples = [ b"foo\nbar\n\ntar", b"\nfoo\nbar\n\ntar", b"foo\nbar\n\ntar\n", b"\n\n\n", b"\n", b"", b"foobartar", ] path = self._make_random_path() for text in samples: expected_lines = text.splitlines(True) with self.fs.open_file(path, "w") as f: f.write(text) with self.fs.open_file(path) as f: lines = get_lines(f) self.assertEqual(lines, expected_lines) def readline(self): def get_lines(f): lines = [] while 1: line = f.readline() if not line: break lines.append(line) return lines self.__check_readline(get_lines) def readline_big(self): for i in range(10, 23): x = b"*" * (2**i) + b"\n" path = self._make_random_file(content=x) with self.fs.open_file(path) as f: line = f.readline() self.assertEqual( line, x, "len(a) = %d, len(x) = %d" % (len(line), len(x)) ) def readline_and_read(self): content = b"first line\nsecond line\n" path = 
self._make_random_file(content=content) chunks = [] with self.fs.open_file(path) as f: chunks.append(f.read(1)) chunks.append(f.readline()) chunks.append(f.read(4)) self.assertEqual(chunks, [b'f', b'irst line\n', b'seco']) def iter_lines(self): def get_lines_explicit(f): lines = [] while 1: try: lines.append(next(f)) except StopIteration: break return lines def get_lines_implicit(f): return [l for l in f] for fun in get_lines_explicit, get_lines_implicit: self.__check_readline(fun) def seek(self): lines = [b"1\n", b"2\n", b"3\n"] data = b"".join(lines) path = self._make_random_path() with self.fs.open_file(path, "w") as f: f.write(data) with self.fs.open_file(path) as f: for i, l in enumerate(lines): f.seek(sum(map(len, lines[:i]))) self.assertEqual(f.readline(), l) f.seek(0) self.assertEqual(f.readline(), lines[0]) f.seek(sum(map(len, lines[:i]))) self.assertEqual(f.readline(), l) with self.fs.open_file(path) as f: f.seek(1) f.seek(1, os.SEEK_CUR) self.assertEqual(f.tell(), 2) f.seek(-1, os.SEEK_END) self.assertEqual(f.tell(), len(data) - 1) # seek past end of file self.assertRaises(IOError, f.seek, len(data) + 10) def block_boundary(self): path = self._make_random_path() CHUNK_SIZE = 10 N = 2 # (dfs.namenode.fs-limits.min-block-size): 4096 < 1048576 bs = max(1048576, N * utils.get_bytes_per_checksum()) total_data_size = 2 * bs with self.fs.open_file(path, "w", blocksize=bs) as f: i = 0 bufsize = 12 * 1024 * 1024 while i < total_data_size: data = b'X' * min(bufsize, total_data_size - i) f.write(data) i += bufsize with self.fs.open_file(path) as f: p = total_data_size - CHUNK_SIZE for pos in (0, 1, bs - 1, bs, bs + 1, p - 1, p, p + 1, total_data_size - 1): expected_len = ( CHUNK_SIZE if pos <= p else total_data_size - pos ) f.seek(pos) chunk = f.read(CHUNK_SIZE) self.assertEqual(len(chunk), expected_len) def walk(self): new_d, new_f = self._make_random_dir(), self._make_random_file() for top in new_d, new_f: items = list(self.fs.walk(top)) self.assertEqual(len(items), 1) self.assertEqualPathInfo(items[0], self.fs.get_path_info(top)) top = new_d cache = [top] for _ in range(2): cache.append(self._make_random_file(where=top)) parent = self._make_random_dir(where=top) cache.append(parent) for _ in range(2): cache.append(self._make_random_file(where=parent)) child = self._make_random_dir(where=parent) cache.append(child) for _ in range(2): cache.append(self._make_random_file(where=child)) infos = list(self.fs.walk(top)) expected_infos = [self.fs.get_path_info(p) for p in cache] self.assertEqual(len(infos), len(expected_infos)) for l in infos, expected_infos: l.sort(key=operator.itemgetter("name")) for i, e in zip(infos, expected_infos): self.assertEqualPathInfo(i, e) if not _is_py3: # check it's OK for "top" to be a bytes string b_top = self._make_random_dir(add_uni=False) try: b_top = b_top.encode() except Exception: pass list(self.fs.walk(b_top)) nonexistent_walk = self.fs.walk(self._make_random_path()) if _is_py3: self.assertRaises(OSError, lambda: next(nonexistent_walk)) else: self.assertRaises(IOError, lambda: next(nonexistent_walk)) for top in '', None: self.assertRaises(ValueError, lambda: next(self.fs.walk(top))) def exists(self): self.assertFalse(self.fs.exists('some_file')) self.assertFalse(self.fs.exists('some_file/other_file')) dname = self._make_random_dir() self.assertTrue(self.fs.exists(dname)) fname = self._make_random_file() self.assertTrue(self.fs.exists(fname)) self.assertRaises(ValueError, self.fs.exists, "") def text_io(self): t_path, b_path = self._make_random_path(), 
self._make_random_path() text = u'a string' + utils.UNI_CHR data = text.encode("utf-8") with self.fs.open_file(t_path, "wt") as fo: chars_written = fo.write(text) with self.fs.open_file(b_path, "w") as fo: bytes_written = fo.write(data) self.assertEqual(chars_written, len(text)) self.assertEqual(bytes_written, len(data)) with self.fs.open_file(t_path, "rt") as f: self.assertEqual(f.read(), text) f.seek(2) self.assertEqual(f.read(), text[2:]) self.assertEqual(f.pread(3, 4), text[3:7]) with self.assertRaises(AttributeError): f.read_chunk("") f.pread_chunk(1, "") with self.fs.open_file(b_path, "r") as f: self.assertEqual(f.read(), data) def __check_path_info(self, info, **expected_values): keys = ('kind', 'group', 'name', 'last_mod', 'replication', 'owner', 'permissions', 'block_size', 'last_access', 'size') for k in keys: self.assertTrue(k in info) for k, exp_v in list(expected_values.items()): v = info[k] self.assertEqual(v, exp_v) def common_tests(): return [ 'open_close', 'delete', 'copy', 'move', 'chmod', 'chmod_w_string', 'file_attrs', 'flush', 'read', 'read_chunk', 'write', 'append', 'tell', 'pread', 'pread_chunk', 'rename', 'change_dir', 'copy_on_self', 'available', 'get_path_info', 'list_directory', 'readline', 'readline_big', 'readline_and_read', 'iter_lines', 'seek', 'block_boundary', 'walk', 'exists', 'text_io', ] ================================================ FILE: test/hdfs/test_common.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT import unittest from pydoop.hdfs.common import parse_mode class TestMode(unittest.TestCase): def runTest(self): for mode in "r", "rb": self.assertEqual(parse_mode(mode), ("r", False)) for mode in "w", "wb": self.assertEqual(parse_mode(mode), ("w", False)) for mode in "a", "ab": self.assertEqual(parse_mode(mode), ("a", False)) self.assertEqual(parse_mode("rt"), ("r", True)) self.assertEqual(parse_mode("wt"), ("w", True)) self.assertEqual(parse_mode("at"), ("a", True)) for mode in "", "k", "kb", "kt": self.assertRaises(ValueError, parse_mode, mode) def suite(): return unittest.TestLoader().loadTestsFromTestCase(TestMode) if __name__ == '__main__': _RUNNER = unittest.TextTestRunner(verbosity=2) _RUNNER.run((suite())) ================================================ FILE: test/hdfs/test_core.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. 
# # END_COPYRIGHT import unittest import uuid from pydoop.hdfs.core import init hdfs = init() class TestCore(unittest.TestCase): def test_default(self): path = "/tmp/pydoop-test-{}".format(uuid.uuid4().hex) fs = f = None try: fs = hdfs.CoreHdfsFs("default", 0) f = fs.open_file(path, "w") f.write(b"bar\n") finally: if f: f.close() fs.delete(path) if fs: fs.close() def suite(): suite_ = unittest.TestSuite() suite_.addTest(TestCore('test_default')) return suite_ if __name__ == '__main__': _RUNNER = unittest.TextTestRunner(verbosity=2) _RUNNER.run((suite())) ================================================ FILE: test/hdfs/test_hdfs.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT from __future__ import division import unittest import tempfile import os import stat from pydoop.utils.py3compat import czip from threading import Thread import pydoop.hdfs as hdfs from pydoop.hdfs.common import BUFSIZE from pydoop.test_utils import UNI_CHR, make_random_data, FSTree class TestHDFS(unittest.TestCase): def setUp(self): wd = tempfile.mkdtemp(suffix='_%s' % UNI_CHR) wd_bn = os.path.basename(wd) self.local_wd = "file:%s" % wd fs = hdfs.hdfs("default", 0) fs.create_directory(wd_bn) self.hdfs_wd = fs.get_path_info(wd_bn)["name"] fs.close() basenames = ["test_path_%d" % i for i in range(2)] self.local_paths = ["%s/%s" % (self.local_wd, bn) for bn in basenames] self.hdfs_paths = ["%s/%s" % (self.hdfs_wd, bn) for bn in basenames] self.data = make_random_data( 4 * BUFSIZE + BUFSIZE // 2, printable=False ) for path in self.local_paths: self.assertTrue(path.startswith("file:")) for path in self.hdfs_paths: if not hdfs.default_is_local(): self.assertTrue(path.startswith("hdfs:")) def tearDown(self): fs = hdfs.hdfs("", 0) fs.delete(self.local_wd) fs.close() fs = hdfs.hdfs("default", 0) fs.delete(self.hdfs_wd) fs.close() def open(self): for test_path in self.hdfs_paths[0], self.local_paths[0]: with hdfs.open(test_path, "w") as f: f.write(self.data) f.fs.close() with hdfs.open(test_path) as f: self.assertEqual(f.read(), self.data) f.fs.close() def dump(self): for test_path in self.hdfs_paths[0], self.local_paths[0]: hdfs.dump(self.data, test_path, mode="wb") with hdfs.open(test_path) as fi: rdata = fi.read() fi.fs.close() self.assertEqual(rdata, self.data) def __ls(self, ls_func, path_transform): for wd, paths in czip( (self.local_wd, self.hdfs_wd), (self.local_paths, self.hdfs_paths) ): for p in paths: hdfs.dump(self.data, p, mode="wb") test_dir = "%s/%s" % (wd, "test_dir") test_path = "%s/%s" % (test_dir, "test_path") hdfs.dump(self.data, test_path, mode="wb") paths.append(test_dir) for recursive in False, True: if recursive: paths.append(test_path) dir_list = [ path_transform(p) for p in ls_func(wd, recursive=recursive) ] self.assertEqual(sorted(dir_list), sorted(paths)) def lsl(self): self.__ls(hdfs.lsl, lambda x: x["name"]) def ls(self): self.__ls(hdfs.ls, lambda x: x) def mkdir(self): for wd in self.local_wd, 
self.hdfs_wd: d1 = "%s/d1" % wd d2 = "%s/d2" % d1 hdfs.mkdir(d2) dir_list = hdfs.ls(d1) self.assertEqual(len(dir_list), 1) self.assertTrue(dir_list[0].endswith(d2)) def load(self): for test_path in self.hdfs_paths[0], self.local_paths[0]: hdfs.dump(self.data, test_path, mode="wb") rdata = hdfs.load(test_path) self.assertEqual(rdata, self.data) def __make_tree(self, wd, root="d1", create=True): """ d1 |-- d2 | `-- f2 `-- f1 """ d1 = "%s/%s" % (wd, root) t1 = FSTree(d1) d2 = "%s/d2" % d1 t2 = t1.add(d2) if create: hdfs.mkdir(d2) for t, d, bn in ((t1, d1, "f1"), (t2, d2, "f2")): f = "%s/%s" % (d, bn) if create: hdfs.dump(self.data, f, mode="wb") t.add(f, 0) return t1 def __cp_file(self, wd): fn = "%s/fn" % wd hdfs.dump(self.data, fn, mode="wb") dest_dir = "%s/dest_dir" % wd hdfs.mkdir(dest_dir) fn_copy_on_wd = "%s/fn_copy" % wd hdfs.cp(fn, fn_copy_on_wd, mode="wb") self.assertEqual(hdfs.load(fn_copy_on_wd), self.data) self.assertRaises(IOError, hdfs.cp, fn, fn_copy_on_wd) fn_copy_on_dest_dir = "%s/fn" % dest_dir hdfs.cp(fn, dest_dir, mode="wb") self.assertEqual(hdfs.load(fn_copy_on_dest_dir), self.data) self.assertRaises(IOError, hdfs.cp, fn, dest_dir) def __cp_dir(self, wd): src_dir = "%s/src_dir" % wd hdfs.mkdir(src_dir) copy_on_wd = "%s/src_dir_copy" % wd copy_on_copy_on_wd = "%s/src_dir" % copy_on_wd hdfs.cp(src_dir, copy_on_wd, mode="wb") self.assertTrue(hdfs.path.exists(copy_on_wd)) hdfs.cp(src_dir, copy_on_wd, mode="wb") self.assertTrue(hdfs.path.exists(copy_on_copy_on_wd)) self.assertRaises(IOError, hdfs.cp, src_dir, copy_on_wd) def __cp_recursive(self, wd): src_t = self.__make_tree(wd) src = src_t.name copy_on_wd = "%s_copy" % src src_bn, copy_on_wd_bn = [ hdfs.path.basename(d) for d in (src, copy_on_wd) ] hdfs.cp(src, copy_on_wd, mode="wb") exp_t = self.__make_tree(wd, root=copy_on_wd_bn, create=False) for t, exp_t in czip(src_t.walk(), exp_t.walk()): self.assertTrue(hdfs.path.exists(exp_t.name)) if t.kind == 0: self.assertEqual(hdfs.load(exp_t.name), self.data) # check semantics when target dir already exists hdfs.rm(copy_on_wd) hdfs.mkdir(copy_on_wd) hdfs.cp(src, copy_on_wd, mode="wb") exp_t = self.__make_tree(copy_on_wd, root=src_bn, create=False) for t, exp_t in czip(src_t.walk(), exp_t.walk()): self.assertTrue(hdfs.path.exists(exp_t.name)) if t.kind == 0: self.assertEqual(hdfs.load(exp_t.name), self.data) def cp(self): for wd in self.local_wd, self.hdfs_wd: self.__cp_file(wd) self.__cp_dir(wd) self.__cp_recursive(wd) def put(self): src = hdfs.path.split(self.local_paths[0])[-1] dest = self.hdfs_paths[0] with open(src, "wb") as f: f.write(self.data) hdfs.put(src, dest, mode="wb") with hdfs.open(dest) as fi: rdata = fi.read() self.assertEqual(rdata, self.data) def get(self): src = self.hdfs_paths[0] dest = hdfs.path.split(self.local_paths[0])[-1] hdfs.dump(self.data, src, mode="wb") hdfs.get(src, dest, mode="wb") with open(dest, 'rb') as fi: rdata = fi.read() self.assertEqual(rdata, self.data) def rm(self): for wd in self.local_wd, self.hdfs_wd: t1 = self.__make_tree(wd) hdfs.rm(t1.name) self.assertEqual(len(hdfs.ls(wd)), 0) def chmod(self): with tempfile.NamedTemporaryFile(suffix='_%s' % UNI_CHR) as f: hdfs.chmod("file://" + f.name, 444) s = os.stat(f.name) self.assertEqual(444, stat.S_IMODE(s.st_mode)) def move(self): for wd in self.local_wd, self.hdfs_wd: t1 = self.__make_tree(wd) t2 = [_ for _ in t1.children if _.kind == 1][0] f2 = t2.children[0] hdfs.move(f2.name, t1.name) ls = [os.path.basename(_) for _ in hdfs.ls(t1.name)] self.assertTrue(os.path.basename(f2.name) in ls) 
        self.assertEqual(len(hdfs.ls(t2.name)), 0)

    def chown(self):
        new_user = 'nobody'
        test_path = self.hdfs_paths[0]
        hdfs.dump(self.data, test_path, mode="wb")
        hdfs.chown(test_path, user=new_user)
        path_info = hdfs.lsl(test_path)[0]
        self.assertEqual(path_info['owner'], new_user)
        prev_owner = path_info['owner']
        prev_grp = path_info['group']
        # owner and group should remain unchanged
        hdfs.chown(test_path, user='', group='')
        path_info = hdfs.lsl(test_path)[0]
        self.assertEqual(path_info['owner'], prev_owner)
        self.assertEqual(path_info['group'], prev_grp)

    def rename(self):
        test_path = self.hdfs_paths[0]
        new_path = "%s.new" % test_path
        hdfs.dump(self.data, test_path, mode="wb")
        hdfs.rename(test_path, new_path)
        self.assertFalse(hdfs.path.exists(test_path))
        self.assertTrue(hdfs.path.exists(new_path))
        if not hdfs.default_is_local():
            self.assertRaises(
                RuntimeError, hdfs.rename, new_path, self.local_paths[0]
            )

    def renames(self):
        test_path = self.hdfs_paths[0]
        hdfs.dump(self.data, test_path, mode="wb")
        new_d = hdfs.path.join(self.hdfs_wd, "new_dir")
        new_path = hdfs.path.join(new_d, "new_p")
        hdfs.renames(test_path, new_path)
        self.assertFalse(hdfs.path.exists(test_path))
        self.assertTrue(hdfs.path.exists(new_path))

    def capacity(self):
        fs = hdfs.hdfs("", 0)
        self.assertRaises(RuntimeError, fs.capacity)
        fs.close()
        if not hdfs.default_is_local():
            fs = hdfs.hdfs("default", 0)
            cap = fs.capacity()
            self.assertGreaterEqual(cap, 0)

    def get_hosts(self):
        if hdfs.default_is_local():  # only run on HDFS
            return
        hdfs.dump(self.data, self.hdfs_paths[0], mode="wb")
        fs = hdfs.hdfs("default", 0)
        hs = fs.get_hosts(self.hdfs_paths[0], 0, 10)
        self.assertTrue(len(hs) > 0)
        self.assertRaises(
            ValueError, fs.get_hosts, self.hdfs_paths[0], -10, 10
        )
        self.assertRaises(ValueError, fs.get_hosts, self.hdfs_paths[0], 0, -10)

    def thread_allow(self):
        # test whether our code is properly allowing other python threads
        # to make progress while we're busy doing I/O

        class BusyCounter(Thread):

            def __init__(self):
                super(BusyCounter, self).__init__()
                self.done = False
                self._count = 0

            @property
            def count(self):
                return self._count

            def run(self):
                while not self.done:
                    self._count += 1

        class BusyContext(object):

            def __init__(self):
                self.counter = None

            def __enter__(self):
                self.counter = BusyCounter()
                self.counter.start()

            def __exit__(self, _1, _2, _3):
                self.counter.done = True
                self.counter.join()

            @property
            def count(self):
                return self.counter.count

        some_data = b"a" * (5 * 1024 * 1024)  # 5 MB
        counter = BusyContext()
        ###########################
        acceptable_threshold = 5
        # The tests were sometimes failing on TravisCI (slower machines) with
        # counts below 100. A test where we left the GIL locked showed that
        # in that case the counter value doesn't change at all across calls,
        # so in theory even an increment of 1 would demonstrate that the
        # mechanism is working.
        # If the hdfs call doesn't release the GIL, the counter won't make
        # any progress during the HDFS call and will be stuck at 0. On the
        # other hand, if the GIL is released during the operation we'll see a
        # count value > 0.
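# --- Illustrative aside (not part of the original test) ---
# A standalone sketch of the same GIL check against time.sleep, which is
# known to release the GIL; count_while is a hypothetical helper.
import time
from threading import Thread

def count_while(op):
    state = {"done": False, "count": 0}

    def spin():
        while not state["done"]:
            state["count"] += 1

    t = Thread(target=spin)
    t.start()
    op()  # if op releases the GIL, spin() keeps incrementing the count
    state["done"] = True
    t.join()
    return state["count"]

assert count_while(lambda: time.sleep(0.1)) > 0
# --- end aside ---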
fs = hdfs.hdfs("default", 0) with fs.open_file(self.hdfs_paths[0], "w") as f: with counter: f.write(some_data) self.assertGreaterEqual(counter.count, acceptable_threshold) with fs.open_file(self.hdfs_paths[0], "r") as f: with counter: f.read() self.assertGreaterEqual(counter.count, acceptable_threshold) with counter: fs.get_hosts(self.hdfs_paths[0], 0, 10) self.assertGreaterEqual(counter.count, acceptable_threshold) with counter: fs.list_directory('/') self.assertGreaterEqual(counter.count, acceptable_threshold) with counter: hdfs.cp(self.hdfs_paths[0], self.hdfs_paths[0] + '_2', mode="wb") self.assertGreaterEqual(counter.count, acceptable_threshold) with counter: hdfs.rm(self.hdfs_paths[0] + '_2') self.assertGreaterEqual(counter.count, acceptable_threshold) # ...we could go on, but the better strategy would be to insert a check # analogous to these in each method's unit test def suite(): suite_ = unittest.TestSuite() suite_.addTest(TestHDFS("open")) suite_.addTest(TestHDFS("dump")) suite_.addTest(TestHDFS("lsl")) suite_.addTest(TestHDFS("ls")) suite_.addTest(TestHDFS("mkdir")) suite_.addTest(TestHDFS("load")) suite_.addTest(TestHDFS("cp")) suite_.addTest(TestHDFS("put")) suite_.addTest(TestHDFS("get")) suite_.addTest(TestHDFS("rm")) suite_.addTest(TestHDFS("chmod")) suite_.addTest(TestHDFS("move")) suite_.addTest(TestHDFS("chown")) suite_.addTest(TestHDFS("rename")) suite_.addTest(TestHDFS("renames")) suite_.addTest(TestHDFS("capacity")) suite_.addTest(TestHDFS("get_hosts")) # randomly fails on Travis # suite_.addTest(TestHDFS("thread_allow")) return suite_ if __name__ == '__main__': _RUNNER = unittest.TextTestRunner(verbosity=2) _RUNNER.run((suite())) ================================================ FILE: test/hdfs/test_hdfs_fs.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. 
# # END_COPYRIGHT import unittest import getpass import os import socket from itertools import product import pydoop.hdfs as hdfs from common_hdfs_tests import TestCommon, common_tests import pydoop.test_utils as u from pydoop.utils.py3compat import clong CURRENT_USER = getpass.getuser() DEFAULT_FS = hdfs.fs._default_fs() def get_explicit_hp(): hp = DEFAULT_FS.netloc.split(":") if len(hp) < 2: hp.append(u._DEFAULT_HDFS_PORT) return os.getenv("HDFS_HOST", hp[0]), int(os.getenv("HDFS_PORT", hp[1])) class TestConnection(unittest.TestCase): def setUp(self): self.hp_cases = [("default", 0)] self.u_cases = [None, CURRENT_USER] if DEFAULT_FS.scheme == "hdfs": hdfs_host, hdfs_port = get_explicit_hp() self.hp_cases.append((hdfs_host, hdfs_port)) self.u_cases.append("nobody") try: hdfs_ip = socket.gethostbyname(hdfs_host) except socket.gaierror: pass else: self.hp_cases.append((hdfs_ip, hdfs_port)) def connect(self): for host, port in self.hp_cases: for user in self.u_cases: expected_user = user or CURRENT_USER with hdfs.hdfs(host, port, user=user) as fs: self.assertEqual(fs.user, expected_user) def cache(self): for (h1, p1), (h2, p2) in product(self.hp_cases, repeat=2): hdfs.hdfs._CACHE.clear() hdfs.hdfs._ALIASES = {"host": {}, "port": {}, "user": {}} # FIXME with hdfs.hdfs(h1, p1) as fs1: with hdfs.hdfs(h2, p2) as fs2: print(' * %r vs %r' % ((h1, p1), (h2, p2))) self.assertTrue(fs2.fs is fs1.fs) for fs in fs1, fs2: self.assertFalse(fs.closed) for fs in fs1, fs2: self.assertTrue(fs.closed) class TestHDFS(TestCommon): def __init__(self, target): TestCommon.__init__(self, target, 'default', 0) def capacity(self): c = self.fs.capacity() self.assertTrue(isinstance(c, (int, clong))) def default_block_size(self): dbs = self.fs.default_block_size() self.assertTrue(isinstance(dbs, (int, clong))) def used(self): u_ = self.fs.used() self.assertTrue(isinstance(u_, (int, clong))) def chown(self): new_owner = "nobody" new_group = "users" path = self._make_random_file() old_owner = self.fs.get_path_info(path)["owner"] old_group = self.fs.get_path_info(path)["group"] self.fs.chown(path, user=new_owner) self.assertEqual(self.fs.get_path_info(path)["owner"], new_owner) self.assertEqual(self.fs.get_path_info(path)["group"], old_group) self.fs.chown(path, group=new_group) self.assertEqual(self.fs.get_path_info(path)["owner"], new_owner) self.assertEqual(self.fs.get_path_info(path)["group"], new_group) self.fs.chown(path, old_owner, old_group) self.assertEqual(self.fs.get_path_info(path)["owner"], old_owner) self.assertEqual(self.fs.get_path_info(path)["group"], old_group) def utime(self): path = self._make_random_file() old_mtime = self.fs.get_path_info(path)["last_mod"] old_atime = self.fs.get_path_info(path)["last_access"] new_mtime = old_mtime - 500 new_atime = old_mtime - 100 self.fs.utime(path, new_mtime, new_atime) self.assertEqual( self.fs.get_path_info(path)["last_mod"], int(new_mtime) ) self.assertEqual( self.fs.get_path_info(path)["last_access"], int(new_atime) ) self.fs.utime(path, old_mtime, old_atime) self.assertEqual( self.fs.get_path_info(path)["last_mod"], int(old_mtime) ) self.assertEqual( self.fs.get_path_info(path)["last_access"], int(old_atime) ) def block_size(self): for bs_MB in range(100, 500, 50): bs = bs_MB * 2**20 path = self._make_random_file(blocksize=bs) self.assertEqual(self.fs.get_path_info(path)["block_size"], bs) def replication(self): for r in range(1, 6): path = self._make_random_file(replication=r) self.assertEqual(self.fs.get_path_info(path)["replication"], r) def 
set_replication(self): old_r, new_r = 2, 3 path = self._make_random_file(replication=old_r) self.fs.set_replication(path, new_r) self.assertEqual(self.fs.get_path_info(path)["replication"], new_r) # HDFS returns less than the number of requested bytes if the chunk # being read crosses the boundary between data blocks. def readline_block_boundary(self): def _write_prefix(f, size, bs): # Avoid memory problem with JVM chunk_size = min(bs, 12 * 1048576) written = 0 while written < size: data = b'X' * min(chunk_size, size - written) written += f.write(data) # (dfs.namenode.fs-limits.min-block-size): 4096 < 1048576 bs = 1048576 line = b"012345678\n" offset = bs - (10 * len(line) + 5) path = self._make_random_path() with self.fs.open_file(path, mode="w", blocksize=bs) as f: bytes_written = lines_written = 0 _write_prefix(f, offset, bs) bytes_written = offset while bytes_written < bs + 1: f.write(line) lines_written += 1 bytes_written += len(line) with self.fs.open_file(path) as f: f.seek(offset) lines = [] while 1: L = f.readline() if not L: break lines.append(L) self.assertEqual(len(lines), lines_written) for i, L in enumerate(lines): self.assertEqual(L, line, "line %d: %r != %r" % (i, L, line)) def get_hosts(self): # (dfs.namenode.fs-limits.min-block-size): 4096 < 1048576 blocksize = 1048576 N = 4 content = b"x" * blocksize * N path = self._make_random_file(content=content, blocksize=blocksize) start = 0 for i in range(N): length = blocksize * i + 1 hosts_per_block = self.fs.get_hosts(path, start, length) self.assertEqual(len(hosts_per_block), i + 1) def suite(): suite_ = unittest.TestSuite() suite_.addTest(TestConnection('connect')) suite_.addTest(TestConnection('cache')) tests = common_tests() if DEFAULT_FS.scheme == "hdfs": tests.extend([ 'capacity', 'default_block_size', 'used', 'chown', 'utime', 'block_size', 'replication', 'set_replication', 'readline_block_boundary', 'get_hosts', ]) for t in tests: suite_.addTest(TestHDFS(t)) return suite_ if __name__ == '__main__': _RUNNER = unittest.TextTestRunner(verbosity=2) _RUNNER.run((suite())) ================================================ FILE: test/hdfs/test_local_fs.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. 
# # END_COPYRIGHT import unittest import getpass import tempfile import os import pydoop.hdfs as hdfs from common_hdfs_tests import TestCommon, common_tests class TestConnection(unittest.TestCase): def runTest(self): current_user = getpass.getuser() cwd = os.getcwd() os.chdir(tempfile.gettempdir()) for user in None, current_user, "nobody": expected_user = current_user fs = hdfs.hdfs("", 0, user=user) self.assertEqual(fs.user, expected_user) fs.close() os.chdir(cwd) class TestLocalFS(TestCommon): def __init__(self, target): TestCommon.__init__(self, target, '', 0) def suite(): suite_ = unittest.TestSuite() suite_.addTest(TestConnection('runTest')) tests = common_tests() for t in tests: suite_.addTest(TestLocalFS(t)) return suite_ if __name__ == '__main__': _RUNNER = unittest.TextTestRunner(verbosity=2) _RUNNER.run((suite())) ================================================ FILE: test/hdfs/test_path.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. # # END_COPYRIGHT import os import unittest import tempfile from numbers import Number import pydoop.hdfs as hdfs from pydoop.hdfs.common import DEFAULT_PORT, DEFAULT_USER from pydoop.utils.misc import make_random_str from pydoop.test_utils import UNI_CHR def uni_last(tup): return tup[:-1] + (tup[-1] + UNI_CHR,) class TestSplit(unittest.TestCase): def good(self): cases = [ ('hdfs://localhost:9000/', ('localhost', 9000, '/')), ('hdfs://localhost:9000/a/b', ('localhost', 9000, '/a/b')), ('hdfs://localhost/a/b', ('localhost', DEFAULT_PORT, '/a/b')), ('hdfs:///a/b', ('default', 0, '/a/b')), ('hdfs:/', ('default', 0, '/')), ('file:///a/b', ('', 0, '/a/b')), ('file:/a/b', ('', 0, '/a/b')), ('file:///a', ('', 0, '/a')), ('file:/a', ('', 0, '/a')), ('file://temp/foo.txt', ('', 0, 'temp/foo.txt')), ('file://temp', ('', 0, 'temp')), ] if hdfs.default_is_local(): cases.extend([ ('///a/b', ('', 0, '/a/b')), ('/a/b', ('', 0, '/a/b')), ('a/b', ('', 0, 'a/b')), ]) else: cases.extend([ ('///a/b', ('default', 0, '/a/b')), ('/a/b', ('default', 0, '/a/b')), ('a/b', ('default', 0, '/user/%s/a/b' % DEFAULT_USER)), ]) for p, r in cases: self.assertEqual(hdfs.path.split(p), r) for p, r in cases[1:]: self.assertEqual(hdfs.path.split(p + UNI_CHR), uni_last(r)) def good_with_user(self): if hdfs.default_is_local(): cases = [ ('a/b', u, ('', 0, 'a/b')) for u in [None, DEFAULT_USER, 'foo'] ] else: cases = [ ('a/b', None, ('default', 0, '/user/%s/a/b' % DEFAULT_USER)), ('a/b', DEFAULT_USER, ( 'default', 0, '/user/%s/a/b' % DEFAULT_USER )), ('a/b', 'foo', ('default', 0, '/user/foo/a/b')), ] for p, u, r in cases: self.assertEqual(hdfs.path.split(p, u), r) self.assertEqual(hdfs.path.split(p + UNI_CHR, u), uni_last(r)) def bad(self): cases = [ '', # not allowed in the Java API 'hdfs:', # no scheme-specific part 'hdfs://', # path part is empty 'ftp://localhost:9000/', # bad scheme 'hdfs://localhost:spam/', # port is not an int 'hdfs://localhost:9000', # path part is empty 'hdfs://localhost:9000/a:b', 
# colon outside netloc '//localhost:9000/a/b', # null scheme ] if not hdfs.default_is_local(): cases.append('/localhost:9000/a/b') # colon outside netloc for p in cases: self.assertRaises(ValueError, hdfs.path.split, p) def splitext(self): for pre in '', 'file:', 'hdfs://host:1': name, ext = '%sfoo' % pre, '.txt' self.assertEqual(hdfs.path.splitext(name + ext), (name, ext)) p = 'hdfs://foo.com:1/' self.assertEqual(hdfs.path.splitext(p), (p, '')) class TestUnparse(unittest.TestCase): def good(self): cases = [ (('hdfs', 'host:1', '/'), 'hdfs://host:1/'), (('file', '', '/'), 'file:/'), (('hdfs', 'host:1', UNI_CHR), 'hdfs://host:1/%s' % UNI_CHR), (('file', '', UNI_CHR), 'file:/%s' % UNI_CHR), (('', '', UNI_CHR), UNI_CHR), ] for (scheme, netloc, path), exp_uri in cases: self.assertEqual(hdfs.path.unparse(scheme, netloc, path), exp_uri) def bad(self): self.assertRaises(ValueError, hdfs.path.unparse, '', 'host:1', '/a') class TestJoin(unittest.TestCase): def __check_join(self, cases): for p, r in cases: self.assertEqual(hdfs.path.join(*p), r) def simple(self): self.__check_join([ (('foo', 'bar', 'tar'), 'foo/bar/tar'), (('/foo', 'bar', 'tar'), '/foo/bar/tar'), ]) def slashes(self): self.__check_join([ (('foo/', 'bar/', 'tar'), 'foo/bar/tar'), (('/foo/', 'bar/', 'tar'), '/foo/bar/tar'), ]) def absolute(self): self.__check_join([ (('foo', '/bar', 'tar'), '/bar/tar'), (('foo', 'hdfs://host:1/bar', 'tar'), 'hdfs://host:1/bar/tar'), (('foo', 'file:/bar', 'tar'), 'file:/bar/tar'), (('foo', 'file:///bar', 'tar'), 'file:///bar/tar'), ]) def full(self): self.__check_join([ (('hdfs://host:1/', '/foo'), 'hdfs://host:1/foo'), (('hdfs://host:1/', 'file:/foo', '/bar'), 'file:/foo/bar'), (('foo', '/bar', 'hdfs://host:1/tar'), 'hdfs://host:1/tar'), ]) def unicode_(self): self.__check_join( [(('/foo', 'bar', UNI_CHR), '/foo/bar/%s' % UNI_CHR)] ) class TestAbspath(unittest.TestCase): def setUp(self): if hdfs.default_is_local(): self.root = "file:" else: fs = hdfs.hdfs("default", 0) self.root = "hdfs://%s:%s" % (fs.host, fs.port) fs.close() self.p = 'a/%s' % UNI_CHR def without_user(self): abs_p = hdfs.path.abspath(self.p, user=None, local=False) if hdfs.default_is_local(): self.assertEqual( abs_p, '%s%s' % (self.root, os.path.abspath(self.p)) ) else: self.assertEqual( abs_p, '%s/user/%s/%s' % (self.root, DEFAULT_USER, self.p) ) def with_user(self): abs_p = hdfs.path.abspath(self.p, user="pydoop", local=False) if hdfs.default_is_local(): self.assertEqual( abs_p, '%s%s' % (self.root, os.path.abspath(self.p)) ) else: self.assertEqual(abs_p, '%s/user/pydoop/%s' % (self.root, self.p)) def forced_local(self): for user in None, "pydoop": abs_p = hdfs.path.abspath(self.p, user=user, local=True) self.assertEqual(abs_p, 'file:%s' % os.path.abspath(self.p)) def already_absolute(self): for p in ('file:/a/%s' % UNI_CHR, 'hdfs://localhost:9000/a/%s' % UNI_CHR): for user in None, "pydoop": abs_p = hdfs.path.abspath(p, user=user, local=False) self.assertEqual(abs_p, p) abs_p = hdfs.path.abspath(p, user=user, local=True) self.assertEqual(abs_p, 'file:%s' % os.path.abspath(p)) class TestSplitBasenameDirname(unittest.TestCase): def runTest(self): cases = [ # path, expected dirname, expected basename ("hdfs://host:1/a/%s" % UNI_CHR, "hdfs://host:1/a", UNI_CHR), ("hdfs://host:1/", "hdfs://host:1/", ""), ("hdfs:/", "hdfs:/", ""), ("file:/", "file:/", ""), ("a/%s" % UNI_CHR, "a", UNI_CHR), ("/a/%s" % UNI_CHR, "/a", UNI_CHR), (UNI_CHR, "", UNI_CHR), ('/%s' % UNI_CHR, "/", UNI_CHR), ('', '', ''), ] for p, d, bn in cases: 
self.assertEqual(hdfs.path.dirname(p), d) self.assertEqual(hdfs.path.basename(p), bn) self.assertEqual(hdfs.path.splitpath(p), (d, bn)) class TestExists(unittest.TestCase): def good(self): base_path = make_random_str() for path in base_path, base_path + UNI_CHR: hdfs.dump("foo\n", path) self.assertTrue(hdfs.path.exists(path)) hdfs.rm(path) self.assertFalse(hdfs.path.exists(path)) class TestKind(unittest.TestCase): def setUp(self): self.path = make_random_str() self.u_path = self.path + UNI_CHR def test_kind(self): for path in self.path, self.u_path: self.assertTrue(hdfs.path.kind(path) is None) try: hdfs.dump("foo\n", path) self.assertEqual('file', hdfs.path.kind(path)) hdfs.rm(path) hdfs.mkdir(path) self.assertEqual('directory', hdfs.path.kind(path)) finally: try: hdfs.rm(path) except IOError: pass def test_isfile(self): for path in self.path, self.u_path: self.assertFalse(hdfs.path.isfile(path)) try: hdfs.dump("foo\n", path) self.assertTrue(hdfs.path.isfile(path)) hdfs.rm(path) hdfs.mkdir(path) self.assertFalse(hdfs.path.isfile(path)) finally: try: hdfs.rm(path) except IOError: pass def test_isdir(self): for path in self.path, self.u_path: self.assertFalse(hdfs.path.isdir(path)) try: hdfs.dump("foo\n", path) self.assertFalse(hdfs.path.isdir(path)) hdfs.rm(path) hdfs.mkdir(path) self.assertTrue(hdfs.path.isdir(path)) finally: try: hdfs.rm(path) except IOError: pass class TestExpand(unittest.TestCase): def expanduser(self): for pre in '~', '~%s' % DEFAULT_USER: for rest in '', '/d': p = '%s%s' % (pre, rest) if hdfs.default_is_local(): self.assertEqual( hdfs.path.expanduser(p), os.path.expanduser(p) ) else: exp_res = '/user/%s%s' % (DEFAULT_USER, rest) self.assertEqual(hdfs.path.expanduser(p), exp_res) def expanduser_no_expansion(self): for pre in ('hdfs://host:1', 'file://', ''): for rest in ('/~', '/~foo', '/d/~', '/d/~foo'): p = '%s%s' % (pre, rest) self.assertEqual(hdfs.path.expanduser(p), p) def expandvars(self): k, v = 'PYDOOP_TEST_K', 'PYDOOP_TEST_V' p = 'hdfs://host:1/${%s}' % k os.environ[k] = v exp_res = '%s/%s' % (p.rsplit('/', 1)[0], v) try: self.assertEqual(hdfs.path.expandvars(p), exp_res) finally: del os.environ[k] class TestStat(unittest.TestCase): NMAP = { 'st_mode': 'permissions', 'st_uid': 'owner', 'st_gid': 'group', 'st_size': 'size', 'st_atime': 'last_access', 'st_mtime': 'last_mod', 'st_blksize': 'block_size', } def stat(self): if hdfs.default_is_local(): return bn = '%s%s' % (make_random_str(), UNI_CHR) fn = '/user/%s/%s' % (DEFAULT_USER, bn) fs = hdfs.hdfs("default", 0) p = "hdfs://%s:%s%s" % (fs.host, fs.port, fn) with fs.open_file(fn, 'wt') as fo: fo.write(make_random_str()) info = fs.get_path_info(fn) fs.close() s = hdfs.path.stat(p) for n1, n2 in self.NMAP.items(): attr = getattr(s, n1, None) self.assertFalse(attr is None) self.assertEqual(attr, info[n2]) self.__check_extra_args(s, info) self.__check_wrapper_funcs(p) hdfs.rm(p) def stat_on_local(self): wd_ = tempfile.mkdtemp(prefix='pydoop_', suffix=UNI_CHR) p_ = os.path.join(wd_, make_random_str()) if hdfs.default_is_local(): wd, p = wd_, p_ host = "default" else: wd, p = ('file:%s' % _ for _ in (wd_, p_)) host = "" fs = hdfs.hdfs(host, 0) with fs.open_file(p_, 'w') as fo: fo.write(b"foobar\n") info = fs.get_path_info(p_) fs.close() s = hdfs.path.stat(p) os_s = os.stat(p_) for n in dir(s): if n.startswith('st_'): try: exp_v = getattr(os_s, n) except AttributeError: try: exp_v = info[self.NMAP[n]] except KeyError: continue self.assertEqual(getattr(s, n), exp_v) self.__check_extra_args(s, info) 
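# --- Illustrative aside (not part of the original suite) ---
# Hedged sketch of the expandvars behaviour exercised by TestExpand above:
# standard ${VAR} references are resolved from os.environ (the variable
# name below is hypothetical).
import os
import pydoop.hdfs as hdfs

os.environ["PYDOOP_DEMO_DIR"] = "data"
try:
    assert (hdfs.path.expandvars("hdfs://host:1/${PYDOOP_DEMO_DIR}") ==
            "hdfs://host:1/data")
finally:
    del os.environ["PYDOOP_DEMO_DIR"]
# --- end aside ---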
self.__check_wrapper_funcs(p) hdfs.rm(wd) def stat_on_dir(self): if hdfs.default_is_local(): wd = tempfile.mkdtemp(prefix='pydoop_', suffix=UNI_CHR) else: wd = make_random_str() + UNI_CHR hdfs.mkdir(wd) s = hdfs.path.stat(wd) if hdfs.default_is_local(): os_s = os.stat(wd) for name in 'st_size', 'st_blksize', 'st_blocks': self.assertEqual(getattr(s, name), getattr(os_s, name)) else: for attr in s.st_size, s.st_blksize, s.st_blocks: self.assertEqual(attr, 0) hdfs.rm(wd) def __check_extra_args(self, stat_res, path_info): for n in 'kind', 'name', 'replication': attr = getattr(stat_res, '%s' % n, None) self.assertFalse(attr is None) self.assertEqual(attr, path_info[n]) def __check_wrapper_funcs(self, path): for n in 'getatime', 'getmtime', 'getctime', 'getsize': func = getattr(hdfs.path, n) self.assertTrue(isinstance(func(path), Number)) class TestIsSomething(unittest.TestCase): def full_and_abs(self): for name in 'isfull', 'isabs': test = getattr(hdfs.path, name) for p in 'hdfs://host:1/foo', 'file:/foo': self.assertTrue(test(p)) self.assertFalse(test('foo')) self.assertFalse(hdfs.path.isfull('/foo')) self.assertTrue(hdfs.path.isabs('/foo')) def islink(self): wd_ = tempfile.mkdtemp(prefix='pydoop_', suffix=UNI_CHR) wd = 'file:%s' % wd_ self.assertFalse(hdfs.path.islink(wd)) link = os.path.join(wd_, make_random_str()) os.symlink(wd_, link) self.assertTrue(hdfs.path.islink('file:%s' % link)) hdfs.rm(wd) def ismount(self): self.assertFalse(hdfs.path.ismount('hdfs://host:1/foo')) class TestNorm(unittest.TestCase): def normpath(self): for pre in '', 'file:', 'hdfs://host:1': post = '/a/./b/c/../../foo' npost = '/a/foo' self.assertEqual(hdfs.path.normpath(pre + post), pre + npost) self.assertEqual(hdfs.path.normpath('a/./b/c/../../foo'), 'a/foo') class TestReal(unittest.TestCase): def realpath(self): wd_ = tempfile.mkdtemp(prefix='pydoop_', suffix=UNI_CHR) wd = 'file:%s' % wd_ link = os.path.join(wd_, make_random_str()) os.symlink(wd_, link) expected_path = 'file:%s' % os.path.realpath(wd_) self.assertEqual(hdfs.path.realpath('file:%s' % link), expected_path) hdfs.rm(wd) class TestSame(unittest.TestCase): def samefile_link(self): wd_ = tempfile.mkdtemp(prefix='pydoop_', suffix=UNI_CHR) wd = 'file:%s' % wd_ link = os.path.join(wd_, make_random_str()) os.symlink(wd_, link) self.assertTrue(hdfs.path.samefile('file:%s' % link, 'file:%s' % wd_)) hdfs.rm(wd) def samefile_rel(self): p = make_random_str() + UNI_CHR hdfs.dump("foo\n", p) self.assertTrue(hdfs.path.samefile(p, hdfs.path.abspath(p))) hdfs.rm(p) def samefile_norm(self): for pre in '', 'file:/', 'hdfs://host:1/': self.assertTrue(hdfs.path.samefile(pre + 'a/b/../c', pre + 'a/c')) def samefile_user(self): if not hdfs.default_is_local(): self.assertTrue(hdfs.path.samefile('fn', '/user/u/fn', user='u')) class TestAccess(unittest.TestCase): def setUp(self): self.path = make_random_str() + UNI_CHR hdfs.mkdir(self.path) def tearDown(self): hdfs.rm(self.path) # FIXME: far from exhaustive. 
This is a slow test def __test(self, offset, user=None): for mode in os.R_OK, os.W_OK, os.X_OK: hdfs.chmod(self.path, mode << offset) print(' * mode now: %03o' % hdfs.path.stat(self.path).st_mode) self.assertTrue(hdfs.path.access(self.path, mode, user=user)) def test_owner(self): self.__test(6) def test_other(self): self.__test(0, user=make_random_str()) class TestUtime(unittest.TestCase): def runTest(self): path = make_random_str() + UNI_CHR hdfs.dump("foo\n", path) st = hdfs.path.stat(path) atime, mtime = [getattr(st, 'st_%stime' % _) for _ in 'am'] new_atime, new_mtime = atime + 100, mtime + 200 hdfs.path.utime(path, (new_atime, new_mtime)) st = hdfs.path.stat(path) self.assertEqual(int(st.st_atime), int(new_atime)) self.assertEqual(int(st.st_mtime), int(new_mtime)) hdfs.rm(path) class TestCallFromHdfs(unittest.TestCase): def setUp(self): self.path = make_random_str() + UNI_CHR hdfs.dump("foo\n", self.path) def tearDown(self): hdfs.rm(self.path) def test_stat(self): for name in 'stat', 'lstat': self.assertTrue(hasattr(hdfs, name)) func = getattr(hdfs, name) func(self.path) func(self.path, user=DEFAULT_USER) def test_access(self): self.assertTrue(hasattr(hdfs, 'access')) hdfs.access(self.path, os.F_OK) hdfs.access(self.path, os.F_OK, user=DEFAULT_USER) def test_utime(self): self.assertTrue(hasattr(hdfs, 'utime')) hdfs.utime(self.path) hdfs.utime(self.path, times=(1e9, 1e9)) hdfs.utime(self.path, times=(1e9, 1e9), user=DEFAULT_USER) def suite(): suite_ = unittest.TestSuite() suite_.addTest(TestSplit('good')) suite_.addTest(TestSplit('good_with_user')) suite_.addTest(TestSplit('bad')) suite_.addTest(TestSplit('splitext')) suite_.addTest(TestUnparse('good')) suite_.addTest(TestUnparse('bad')) suite_.addTest(TestJoin('simple')) suite_.addTest(TestJoin('slashes')) suite_.addTest(TestJoin('absolute')) suite_.addTest(TestJoin('full')) suite_.addTest(TestJoin('unicode_')) suite_.addTest(TestAbspath('with_user')) suite_.addTest(TestAbspath('without_user')) suite_.addTest(TestAbspath('forced_local')) suite_.addTest(TestAbspath('already_absolute')) suite_.addTest(TestSplitBasenameDirname('runTest')) suite_.addTest(TestExists('good')) suite_.addTest(TestExpand('expanduser')) suite_.addTest(TestExpand('expanduser_no_expansion')) suite_.addTest(TestExpand('expandvars')) suite_.addTest(TestStat('stat')) suite_.addTest(TestStat('stat_on_local')) suite_.addTest(TestStat('stat_on_dir')) suite_.addTest(TestIsSomething('full_and_abs')) suite_.addTest(TestIsSomething('islink')) suite_.addTest(TestIsSomething('ismount')) suite_.addTest(TestNorm('normpath')) suite_.addTest(TestReal('realpath')) suite_.addTest(TestSame('samefile_link')) suite_.addTest(TestSame('samefile_rel')) suite_.addTest(TestSame('samefile_norm')) suite_.addTest(TestSame('samefile_user')) suite_.addTest(TestAccess('test_owner')) suite_.addTest(TestAccess('test_other')) suite_.addTest(TestUtime('runTest')) suite_.addTest(TestCallFromHdfs('test_stat')) suite_.addTest(TestCallFromHdfs('test_access')) suite_.addTest(TestCallFromHdfs('test_utime')) suite_.addTest(unittest.TestLoader().loadTestsFromTestCase(TestKind)) return suite_ if __name__ == '__main__': _RUNNER = unittest.TextTestRunner(verbosity=2) _RUNNER.run((suite())) ================================================ FILE: test/hdfs/try_hdfs.py ================================================ # BEGIN_COPYRIGHT # # Copyright 2009-2026 CRS4. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. 
You may obtain a copy
# of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

"""
Check that resetting the hdfs module after changing
os.environ['HADOOP_CONF_DIR'] works (i.e., Pydoop references the correct
HDFS service). Note that it does **NOT** work if you've already
instantiated an hdfs handle, and this is NOT due to the caching system.
"""

from __future__ import print_function

import sys
import os
import argparse

import pydoop.hdfs as hdfs


def dump_status(fs):
    print("(host, port, user) = %r" % ((fs.host, fs.port, fs.user),))
    print("_CACHE = %r" % (fs._CACHE,))
    print("_ALIASES = %r" % (fs._ALIASES,))
    print()


def main(argv=sys.argv[1:]):
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--conf-dir", metavar="HADOOP_CONF_DIR")
    args = parser.parse_args(argv)
    if args.conf_dir:
        os.environ["HADOOP_CONF_DIR"] = os.path.abspath(args.conf_dir)
        hdfs.reset()
    fs = hdfs.hdfs()
    print("--- OPEN ---")
    dump_status(fs)
    print("cwd:", fs.working_directory())
    print()
    fs.close()
    print("--- CLOSED ---")
    dump_status(fs)


if __name__ == "__main__":
    main()


================================================
FILE: test/mapreduce/__init__.py
================================================


================================================
FILE: test/mapreduce/all_tests.py
================================================
# BEGIN_COPYRIGHT
#
# Copyright 2009-2026 CRS4.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
# of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

import unittest
from pydoop.test_utils import get_module

TEST_MODULE_NAMES = [
    'test_connections',
    'test_opaque',
]


def suite(path=None):
    suites = []
    for module in TEST_MODULE_NAMES:
        suites.append(get_module(module, path).suite())
    return unittest.TestSuite(suites)


if __name__ == '__main__':
    import sys
    _RESULT = unittest.TextTestRunner(verbosity=2).run(suite())
    sys.exit(not _RESULT.wasSuccessful())


================================================
FILE: test/mapreduce/it/crs4/pydoop/mapreduce/pipes/OpaqueRoundtrip.java
================================================
/* BEGIN_COPYRIGHT
 *
 * Copyright 2009-2026 CRS4.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy
 * of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.

================================================
FILE: test/mapreduce/__init__.py
================================================


================================================
FILE: test/mapreduce/all_tests.py
================================================
# BEGIN_COPYRIGHT
#
# Copyright 2009-2026 CRS4.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
# of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

import unittest

from pydoop.test_utils import get_module

TEST_MODULE_NAMES = [
    'test_connections',
    'test_opaque',
]


def suite(path=None):
    suites = []
    for module in TEST_MODULE_NAMES:
        suites.append(get_module(module, path).suite())
    return unittest.TestSuite(suites)


if __name__ == '__main__':
    import sys
    _RESULT = unittest.TextTestRunner(verbosity=2).run(suite())
    sys.exit(not _RESULT.wasSuccessful())


================================================
FILE: test/mapreduce/it/crs4/pydoop/mapreduce/pipes/OpaqueRoundtrip.java
================================================
/* BEGIN_COPYRIGHT
 *
 * Copyright 2009-2026 CRS4.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy
 * of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * END_COPYRIGHT
 */

package it.crs4.pydoop.mapreduce.pipes;

import java.io.IOException;
import java.util.List;
import java.util.Properties;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.conf.Configuration;


/**
 * Use PipesNonJavaInputFormat.getSplits to read opaque splits from inUri,
 * then write them out to outUri.
 */
public class OpaqueRoundtrip {

    public static void main(String[] args)
            throws IOException, InterruptedException {
        final String inUri = args[0];
        final String outUri = args[1];
        JobID jobId = new JobID("201408272347", 0);
        TaskID taskId = new TaskID(jobId, TaskType.MAP, 0);
        TaskAttemptID taID = new TaskAttemptID(taskId, 0);
        Job job = Job.getInstance(new Configuration());
        job.setJobID(jobId);
        Properties props = Submitter.getPydoopProperties();
        Configuration conf = job.getConfiguration();
        conf.set(props.getProperty("PIPES_EXTERNALSPLITS_URI"), inUri);
        TaskAttemptContextImpl ctx = new TaskAttemptContextImpl(conf, taID);
        PipesNonJavaInputFormat iformat = new PipesNonJavaInputFormat();
        List<InputSplit> splits = iformat.getSplits(ctx);
        Path path = new Path(outUri);
        FileSystem fs = FileSystem.get(conf);
        IntWritable numRecords = new IntWritable(splits.size());
        FSDataOutputStream out = fs.create(path);
        try {
            numRecords.write(out);
            for (int i = 0; i < numRecords.get(); i++) {
                ((OpaqueSplit) splits.get(i)).write(out);
            }
        } finally {
            out.close();
        }
        fs.close();
    }
}


================================================
FILE: test/mapreduce/test_connections.py
================================================
# BEGIN_COPYRIGHT
#
# Copyright 2009-2026 CRS4.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
# of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

import io
import os
import unittest

import pydoop.mapreduce.api as api
import pydoop.mapreduce.binary_protocol as bp
import pydoop.mapreduce.pipes as pipes
import pydoop.sercore as sercore
from pydoop.test_utils import WDTestCase

THIS_DIR = os.path.dirname(os.path.abspath(__file__))
M_NAME, R_NAME = "m_task.cmd", "r_task.cmd"


class Mapper(api.Mapper):

    def map(self, context):
        context.emit(context.key, context.value)


class Reducer(api.Reducer):

    def reduce(self, context):
        context.emit(context.key, sum(context.values))


# move to test_utils?
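# The uplink dump decoded by UplinkDumpReader below is, as far as these tests
# are concerned, a flat sequence of framed commands: a vint opcode followed
# by opcode-specific fields (read_tuple format characters: i = vint,
# l = vlong, f = float, s = string, b = bytes), terminated by a DONE opcode.
# Opcode values come from pydoop.mapreduce.binary_protocol.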
class UplinkDumpReader(object):

    def __init__(self, stream):
        self.stream = stream

    def close(self):
        self.stream.close()

    def __next__(self):
        cmd = self.stream.read_vint()
        if cmd == bp.AUTHENTICATION_RESP:
            return cmd, self.stream.read_tuple("b")
        elif cmd == bp.OUTPUT:
            return cmd, self.stream.read_tuple("bb")
        elif cmd == bp.PARTITIONED_OUTPUT:
            return cmd, self.stream.read_tuple("ibb")
        elif cmd == bp.STATUS:
            return cmd, self.stream.read_tuple("s")
        elif cmd == bp.PROGRESS:
            return cmd, self.stream.read_tuple("f")
        elif cmd == bp.REGISTER_COUNTER:
            return cmd, self.stream.read_tuple("iss")
        elif cmd == bp.INCREMENT_COUNTER:
            return cmd, self.stream.read_tuple("il")
        elif cmd == bp.DONE:
            raise StopIteration
        else:
            raise RuntimeError("unknown command: %d" % cmd)

    def __iter__(self):
        return self

    # py2 compat
    def next(self):
        return self.__next__()


class TestFileConnection(WDTestCase):

    def test_map(self):
        factory = pipes.Factory(Mapper)
        self.__run_test(M_NAME, factory, private_encoding=False)

    def test_reduce(self):
        factory = pipes.Factory(Mapper, reducer_class=Reducer)
        self.__run_test(R_NAME, factory)

    def __run_test(self, name, factory, **kwargs):
        orig_path = os.path.join(THIS_DIR, name)
        cmd_path = os.path.join(self.wd, name)
        with io.open(orig_path, "rb") as fi, io.open(cmd_path, "wb") as fo:
            fo.write(fi.read())
        os.environ["mapreduce.pipes.commandfile"] = cmd_path
        pipes.run_task(factory, **kwargs)
        out_cmd_path = "%s.out" % cmd_path
        self.assertTrue(os.path.exists(out_cmd_path))
        with sercore.FileInStream(out_cmd_path) as stream:
            out_cmds = set(cmd for cmd, _ in UplinkDumpReader(stream))
        self.assertEqual(out_cmds, {bp.OUTPUT, bp.PROGRESS})


def suite():
    suite_ = unittest.TestSuite()
    suite_.addTest(TestFileConnection('test_map'))
    suite_.addTest(TestFileConnection('test_reduce'))
    return suite_


if __name__ == '__main__':
    _RUNNER = unittest.TextTestRunner(verbosity=2)
    _RUNNER.run(suite())
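
# Minimal sketch of the file-connection mechanism exercised above, outside
# unittest (the command file path is hypothetical). As in __run_test, the
# task runner reads downlink commands from the file named by the
# mapreduce.pipes.commandfile environment variable and dumps the uplink to
# "<commandfile>.out":
#
#   os.environ["mapreduce.pipes.commandfile"] = "/tmp/m_task.cmd"
#   pipes.run_task(pipes.Factory(Mapper), private_encoding=False)
#   # the uplink dump is now in /tmp/m_task.cmd.out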

================================================
FILE: test/mapreduce/test_opaque.py
================================================
# BEGIN_COPYRIGHT
#
# Copyright 2009-2026 CRS4.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
# of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

import unittest
import os
import shutil
import uuid

import pydoop
from pydoop.hdfs import hdfs
from pydoop.mapreduce.pipes import (
    OpaqueSplit, write_opaque_splits, read_opaque_splits
)
import pydoop.test_utils as utils

_JAVA_SRC_ROOT = 'it'
_OPAQUE_ROUNDTRIP_CLASS = 'it.crs4.pydoop.mapreduce.pipes.OpaqueRoundtrip'
_OPAQUE_ROUNDTRIP_SRC = 'it/crs4/pydoop/mapreduce/pipes/OpaqueRoundtrip.java'


class TestOpaqueSplit(unittest.TestCase):

    def setUp(self):
        self.fs = hdfs()
        self.wd = utils.make_wd(self.fs)

    def tearDown(self):
        self.fs.delete(self.wd)
        self.fs.close()

    def _make_random_path(self, where=None):
        return "%s/%s_%s" % (where or self.wd, uuid.uuid4().hex, utils.UNI_CHR)

    def _generate_opaque_splits(self, n):
        return [OpaqueSplit('{}_payload'.format(_)) for _ in range(n)]

    def _test_opaque(self, o, no):
        self.assertEqual(o.payload, no.payload)

    def _test_opaques(self, opaques, nopaques):
        self.assertEqual(len(opaques), len(nopaques))
        for o, no in zip(opaques, nopaques):
            self._test_opaque(o, no)

    def _run_java(self, in_uri, out_uri, wd):
        this_directory = os.path.abspath(os.path.dirname(__file__))
        shutil.copytree(os.path.join(this_directory, _JAVA_SRC_ROOT),
                        os.path.join(wd, _JAVA_SRC_ROOT))
        classpath = '.:%s:%s:%s' % (
            wd, pydoop.jar_path(), pydoop.hadoop_classpath())
        src = os.path.join(wd, _OPAQUE_ROUNDTRIP_SRC)
        utils.compile_java(src, classpath)
        utils.run_java(
            _OPAQUE_ROUNDTRIP_CLASS, classpath, [in_uri, out_uri], wd)

    def _do_java_roundtrip(self, splits, wd='/tmp'):
        in_uri = self._make_random_path()
        out_uri = self._make_random_path()
        with self.fs.open_file(in_uri, 'wb') as f:
            write_opaque_splits(splits, f)
        self._run_java(in_uri, out_uri, wd)
        with self.fs.open_file(out_uri, 'rb') as f:
            nsplits = read_opaque_splits(f)
        return nsplits

    def test_opaque(self):
        payload = {'a': 33, 'b': "333"}
        o = OpaqueSplit(payload)
        self.assertEqual(payload, o.payload)
        fname = self._make_random_path('/tmp')
        with open(fname, 'wb') as f:
            o.write(f)
        with open(fname, 'rb') as f:
            no = OpaqueSplit.read(f)
        self._test_opaque(o, no)
        os.unlink(fname)

    def test_write_read_opaque_splits(self):
        n = 10
        opaques = self._generate_opaque_splits(n)
        fname = self._make_random_path('/tmp')
        with open(fname, 'wb') as f:
            write_opaque_splits(opaques, f)
        with open(fname, 'rb') as f:
            nopaques = read_opaque_splits(f)
        self._test_opaques(opaques, nopaques)
        os.unlink(fname)

    def test_opaque_java_round_trip(self):
        n = 10
        splits = self._generate_opaque_splits(n)
        dname = self._make_random_path('/tmp')
        os.mkdir(dname)
        nsplits = self._do_java_roundtrip(splits, wd=dname)
        shutil.rmtree(dname)
        self._test_opaques(splits, nsplits)


def suite():
    return unittest.TestLoader().loadTestsFromTestCase(TestOpaqueSplit)


if __name__ == '__main__':
    _RUNNER = unittest.TextTestRunner(verbosity=2)
    _RUNNER.run(suite())
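
# Local-only roundtrip sketch (no HDFS, no Java), mirroring
# test_write_read_opaque_splits; the output path is hypothetical:
#
#   splits = [OpaqueSplit({"index": i}) for i in range(3)]
#   with open("/tmp/splits.bin", "wb") as f:
#       write_opaque_splits(splits, f)
#   with open("/tmp/splits.bin", "rb") as f:
#       rsplits = read_opaque_splits(f)
#   assert [s.payload for s in rsplits] == [s.payload for s in splits]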

================================================
FILE: test/sercore/all_tests.py
================================================
# BEGIN_COPYRIGHT
#
# Copyright 2009-2026 CRS4.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
# of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

import unittest

from pydoop.test_utils import get_module

TEST_MODULE_NAMES = [
    'test_deser',
    'test_streams',
]


def suite(path=None):
    suites = []
    for module in TEST_MODULE_NAMES:
        suites.append(get_module(module, path).suite())
    return unittest.TestSuite(suites)


if __name__ == '__main__':
    import sys
    _RESULT = unittest.TextTestRunner(verbosity=2).run(suite())
    sys.exit(not _RESULT.wasSuccessful())


================================================
FILE: test/sercore/test_deser.py
================================================
# BEGIN_COPYRIGHT
#
# Copyright 2009-2026 CRS4.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
# of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

import os
import shutil
import struct
import tempfile
import unittest

import pydoop.sercore as sercore


class TestFileSplit(unittest.TestCase):

    def setUp(self):
        work_dir = tempfile.mkdtemp(prefix="pydoop_")
        work_path = os.path.join(work_dir, "foo")
        self.filename, self.offset, self.length = "foobar", 0, 100
        with sercore.FileOutStream(work_path) as s:
            s.write_string(self.filename)
            s.write(struct.pack(">q", self.offset))
            s.write(struct.pack(">q", self.length))
        size = os.stat(work_path).st_size
        with sercore.FileInStream(work_path) as s:
            self.raw_split = s.read(size)
        shutil.rmtree(work_dir)

    def test_standard(self):
        t = sercore.deserialize_file_split(self.raw_split)
        self.assertEqual(len(t), 3)
        self.assertEqual(t[0], self.filename)
        self.assertEqual(t[1], self.offset)
        self.assertEqual(t[2], self.length)

    def test_errors(self):
        with self.assertRaises(IOError):
            sercore.deserialize_file_split(self.raw_split[:-1])


CASES = [
    TestFileSplit,
]


def suite():
    ret = unittest.TestSuite()
    test_loader = unittest.TestLoader()
    for c in CASES:
        ret.addTest(test_loader.loadTestsFromTestCase(c))
    return ret


if __name__ == '__main__':
    unittest.TextTestRunner(verbosity=2).run(suite())
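
# The raw split assembled in TestFileSplit.setUp suggests the wire format
# expected by sercore.deserialize_file_split: a length-prefixed string for
# the file name (as written by write_string), followed by two big-endian
# 64-bit integers (struct format ">q") for the split's offset and length.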

================================================
FILE: test/sercore/test_streams.py
================================================
# BEGIN_COPYRIGHT
#
# Copyright 2009-2026 CRS4.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
# of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

import io
import os
import shutil
import struct
import tempfile
import unittest
import uuid
from random import randint

from pydoop.mapreduce.binary_protocol import OUTPUT, PARTITIONED_OUTPUT
import pydoop.sercore as sercore

INT64_MIN = -2**63
INT64_MAX = 2**63 - 1

# TODO: from pydoop.test_utils import UNI_CHR
UNI_CHR = u'\N{CYRILLIC CAPITAL LETTER O WITH DIAERESIS}'


class TestFileInStream(unittest.TestCase):

    def setUp(self):
        with io.open(__file__, "rb") as f:
            self.data = f.read()

    def test_from_path(self):
        with sercore.FileInStream(__file__) as s:
            self.__check_stream(s)

    def test_from_file(self):
        with io.open(__file__, "rb") as f:
            with sercore.FileInStream(f) as s:
                self.__check_stream(s)

    def test_errors(self):
        with self.assertRaises(IOError):
            sercore.FileInStream(uuid.uuid4().hex)
        with sercore.FileInStream(__file__) as s:
            s.skip(len(self.data))
            with self.assertRaises(IOError):
                s.read(1)

    def __check_stream(self, s):
        self.assertEqual(s.read(10), self.data[:10])
        s.skip(20)
        self.assertEqual(s.read(20), self.data[30:50])


class TestFileOutStream(unittest.TestCase):

    def setUp(self):
        self.wd = tempfile.mkdtemp(prefix="pydoop_")
        self.fname = os.path.join(self.wd, "foo")
        self.data = b"abcdefgh"

    def tearDown(self):
        shutil.rmtree(self.wd)

    def test_from_path(self):
        with sercore.FileOutStream(self.fname) as s:
            self.__fill_stream(s)
        self.__check_stream()

    def test_from_file(self):
        with io.open(self.fname, "wb") as f:
            with sercore.FileOutStream(f) as s:
                self.__fill_stream(s)
        self.__check_stream()

    def test_errors(self):
        with self.assertRaises(IOError):
            sercore.FileOutStream(os.path.join(uuid.uuid4().hex, "foo"))

    def __fill_stream(self, s):
        s.write(self.data)
        s.flush()
        s.advance(10)
        s.write(self.data)

    def __check_stream(self):
        with io.open(self.fname, "rb") as f:
            self.assertEqual(f.read(), self.data + 10 * b'\x00' + self.data)


class TestSerDe(unittest.TestCase):

    INT = 42
    LONG = INT64_MAX
    FLOAT = 3.14
    STRING = u'BO' + UNI_CHR
    BYTES = b'a\x00b'  # bytes r/w methods MUST preserve null characters
    TUPLE = INT, LONG, FLOAT, STRING, BYTES

    def setUp(self):
        self.wd = tempfile.mkdtemp(prefix="pydoop_")
        self.fname = os.path.join(self.wd, "foo")

    def tearDown(self):
        shutil.rmtree(self.wd)

    def test_vint(self):
        with sercore.FileOutStream(self.fname) as s:
            s.write_vint(self.INT)
        with sercore.FileInStream(self.fname) as s:
            self.assertEqual(s.read_vint(), self.INT)

    def test_vlong(self):
        with sercore.FileOutStream(self.fname) as s:
            s.write_vlong(self.LONG)
        with sercore.FileInStream(self.fname) as s:
            self.assertEqual(s.read_vlong(), self.LONG)

    def test_float(self):
        with sercore.FileOutStream(self.fname) as s:
            s.write_float(self.FLOAT)
        with sercore.FileInStream(self.fname) as s:
            self.assertAlmostEqual(s.read_float(), self.FLOAT, 3)

    def test_string_as_string(self):
        with sercore.FileOutStream(self.fname) as s:
            s.write_string(self.STRING)
        with sercore.FileInStream(self.fname) as s:
            self.assertEqual(s.read_string(), self.STRING)

    def test_string_as_bytes(self):
        with sercore.FileOutStream(self.fname) as s:
            s.write_string(self.STRING)
        with sercore.FileInStream(self.fname) as s:
            self.assertEqual(s.read_bytes(), self.STRING.encode("utf8"))

    def test_bytes_as_string(self):
        with sercore.FileOutStream(self.fname) as s:
            s.write_bytes(self.BYTES)
        with sercore.FileInStream(self.fname) as s:
            self.assertEqual(s.read_string(), self.BYTES.decode("utf8"))

    def test_bytes_as_bytes(self):
        with sercore.FileOutStream(self.fname) as s:
            s.write_bytes(self.BYTES)
        with sercore.FileInStream(self.fname) as s:
            self.assertEqual(s.read_bytes(), self.BYTES)

    def test_output(self):
        k, v = b"key", b"value"
        with sercore.FileOutStream(self.fname) as s:
            s.write_output(k, v)
        with sercore.FileInStream(self.fname) as s:
            self.assertEqual(s.read_vint(), OUTPUT)
            self.assertEqual(s.read_bytes(), k)
            self.assertEqual(s.read_bytes(), v)
        part = 1
        with sercore.FileOutStream(self.fname) as s:
            s.write_output(k, v, part)
        with sercore.FileInStream(self.fname) as s:
            self.assertEqual(s.read_vint(), PARTITIONED_OUTPUT)
            self.assertEqual(s.read_vint(), part)
            self.assertEqual(s.read_bytes(), k)
            self.assertEqual(s.read_bytes(), v)

    def test_multi_no_tuple(self):
        self.__fill_stream_multi()
        self.__check_stream_multi()

    def test_multi_read_tuple(self):
        self.__fill_stream_multi()
        self.__check_stream_tuple()

    def test_multi_write_tuple(self):
        self.__fill_stream_tuple()
        self.__check_stream_multi()

    def test_multi_rw_tuple(self):
        self.__fill_stream_tuple()
        self.__check_stream_tuple()

    def __fill_stream_multi(self):
        with sercore.FileOutStream(self.fname) as s:
            s.write_vint(self.INT)
            s.write_vlong(self.LONG)
            s.write_float(self.FLOAT)
            s.write_string(self.STRING)
            s.write_bytes(self.BYTES)

    def __fill_stream_tuple(self):
        with sercore.FileOutStream(self.fname) as s:
            s.write_tuple("ilfsb", self.TUPLE)

    def __check_stream_multi(self):
        with sercore.FileInStream(self.fname) as s:
            self.assertEqual(s.read_vint(), self.INT)
            self.assertEqual(s.read_vlong(), self.LONG)
            self.assertAlmostEqual(s.read_float(), self.FLOAT, 3)
            self.assertEqual(s.read_string(), self.STRING)
            self.assertEqual(s.read_bytes(), self.BYTES)

    def __check_stream_tuple(self):
        with sercore.FileInStream(self.fname) as s:
            t = s.read_tuple('ilfsb')
            self.assertEqual(len(t), 5)
            self.assertEqual(t[0], self.INT)
            self.assertEqual(t[1], self.LONG)
            self.assertAlmostEqual(t[2], self.FLOAT, 3)
            self.assertEqual(t[3], self.STRING)
            self.assertEqual(t[4], self.BYTES)

    def test_errors(self):
        type_mismatches = [
            ("vint", 1.),
            ("vint", "x"),
            ("vint", b"x"),
            ("vlong", 1.),
            ("vlong", "x"),
            ("vlong", b"x"),
            ("float", "x"),
            ("float", b"x"),
            ("bytes", 1),
            ("bytes", 1.),
            ("bytes", u"x"),
            ("string", 1),
            ("string", 1.),
        ]
        with sercore.FileOutStream(self.fname) as s:
            for name, val in type_mismatches:
                meth = getattr(s, "write_%s" % name)
                self.assertRaises(TypeError, meth, val)
        self.__fill_stream_tuple()
        with sercore.FileInStream(self.fname) as s:
            with self.assertRaises(IOError):
                s.read_tuple("ilfsbi")  # EOF
        with sercore.FileOutStream(self.fname) as s:
            with self.assertRaises(ValueError):
                s.write_tuple("iis", (1, 2))  # not enough items

    # "extra" features

    def test_string_keep_zeros(self):
        pystr = self.BYTES.decode("utf-8")
        with sercore.FileOutStream(self.fname) as s:
            s.write_string(pystr)
        with sercore.FileInStream(self.fname) as s:
            val = s.read_bytes()
        self.assertEqual(val, self.BYTES)

    def test_string_allow_bytes(self):
        with sercore.FileOutStream(self.fname) as s:
            s.write_string(self.BYTES)
        with sercore.FileInStream(self.fname) as s:
            self.assertEqual(s.read_bytes(), self.BYTES)


class TestCheckClosed(unittest.TestCase):

    def test_instream(self):
        with sercore.FileInStream(__file__) as stream:
            pass
        ops = (
            (stream.read, (1,)),
            (stream.read_vint, ()),
            (stream.read_vlong, ()),
            (stream.read_float, ()),
            (stream.read_string, ()),
            (stream.read_tuple, ("ii",)),
            (stream.skip, (1,)),
        )
        self.__check(ops)

    def test_outstream(self):
        wd = tempfile.mkdtemp(prefix="pydoop_")
        fname = os.path.join(wd, "foo")
        with sercore.FileOutStream(fname) as stream:
            pass
        ops = (
            (stream.write, (b"x",)),
            (stream.write_vint, (1,)),
            (stream.write_vlong, (1,)),
            (stream.write_float, (1.0,)),
            (stream.write_string, (u"x",)),
            (stream.write_tuple, ("ii", (1, 1))),
            (stream.advance, (1,)),
            (stream.flush, ()),
        )
        self.__check(ops)
        shutil.rmtree(wd)

    def test_double_close(self):
        wd = tempfile.mkdtemp(prefix="pydoop_")
        fname = os.path.join(wd, "foo")
        with sercore.FileOutStream(fname) as stream:
            pass
        stream.close()
        with sercore.FileInStream(fname) as stream:
            pass
        stream.close()
        with io.open(fname, "wb") as f:
            with sercore.FileOutStream(f) as stream:
                pass
            stream.close()
        with io.open(fname, "rb") as f:
            with sercore.FileInStream(f) as stream:
                pass
            stream.close()

    def __check(self, ops):
        for o, args in ops:
            self.assertRaises(ValueError, o, *args)


class TestHadoopTypes(unittest.TestCase):

    def setUp(self):
        self.wd = tempfile.mkdtemp(prefix="pydoop_")
        self.fname = os.path.join(self.wd, "foo")

    def test_long_writable(self):
        preset_data = (INT64_MIN, -100, -1, 0, 1, 100, INT64_MAX)
        random_data = [randint(INT64_MIN, INT64_MAX) for _ in range(100)]
        for data in preset_data, random_data:
            with io.open(self.fname, "wb") as f:
                f.write(struct.pack(">" + len(data) * "q", *data))
            with sercore.FileInStream(self.fname) as stream:
                for v in data:
                    self.assertEqual(stream.read_long_writable(), v)
        # payload entry, e.g., TextInputFormat key
        k = 1000
        sk = struct.pack(">q", k)
        with sercore.FileOutStream(self.fname) as stream:
            stream.write_bytes(sk)
        with sercore.FileInStream(self.fname) as stream:
            self.assertEqual(stream.read_vint(), len(sk))
            self.assertEqual(stream.read_long_writable(), k)


CASES = [
    TestFileInStream,
    TestFileOutStream,
    TestSerDe,
    TestCheckClosed,
    TestHadoopTypes,
]


def suite():
    ret = unittest.TestSuite()
    test_loader = unittest.TestLoader()
    for c in CASES:
        ret.addTest(test_loader.loadTestsFromTestCase(c))
    return ret


if __name__ == '__main__':
    unittest.TextTestRunner(verbosity=2).run(suite())
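
# Tuple format characters used by write_tuple/read_tuple, as inferred from
# TestSerDe ("ilfsb" vs. TUPLE): i = vint, l = vlong, f = float, s = string,
# b = bytes. A minimal roundtrip sketch (the path is hypothetical):
#
#   with sercore.FileOutStream("/tmp/t.bin") as s:
#       s.write_tuple("is", (42, u"spam"))
#   with sercore.FileInStream("/tmp/t.bin") as s:
#       assert s.read_tuple("is") == (42, u"spam")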