[
  {
    "path": ".github/FUNDING.yml",
    "content": "github: [BurntSushi]\n"
  },
  {
    "path": ".github/workflows/ci.yml",
    "content": "name: ci\non:\n  pull_request:\n  push:\n    branches:\n    - master\n  schedule:\n    - cron: '00 01 * * *'\n\n# The section is needed to drop write-all permissions that are granted on\n# `schedule` event. By specifying any permission explicitly all others are set\n# to none. By using the principle of least privilege the damage a compromised\n# workflow can do (because of an injection or compromised third party tool or\n# action) is restricted. Currently the workflow doesn't need any additional\n# permission except for pulling the code. Adding labels to issues, commenting\n# on pull-requests, etc. may need additional permissions:\n#\n# Syntax for this section:\n# https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#permissions\n#\n# Reference for how to assign permissions on a job-by-job basis:\n# https://docs.github.com/en/actions/using-jobs/assigning-permissions-to-jobs\n#\n# Reference for available permissions that we can enable if needed:\n# https://docs.github.com/en/actions/security-guides/automatic-token-authentication#permissions-for-the-github_token\npermissions:\n  # to fetch code (actions/checkout)\n  contents: read\n\njobs:\n  test:\n    runs-on: ${{ matrix.os }}\n    strategy:\n      fail-fast: false\n      matrix:\n        include:\n        - build: stable\n          os: ubuntu-latest\n          rust: stable\n        - build: beta\n          os: ubuntu-latest\n          rust: beta\n        - build: nightly\n          os: ubuntu-latest\n          rust: nightly\n        - build: macos\n          os: macos-latest\n          rust: stable\n        - build: win-msvc\n          os: windows-latest\n          rust: stable\n        - build: win-gnu\n          os: windows-latest\n          rust: stable-x86_64-gnu\n    env:\n      RUSTFLAGS: -D warnings\n      RUST_BACKTRACE: 1\n    steps:\n    - name: Checkout repository\n      uses: actions/checkout@v4\n    - name: Install Rust\n      uses: dtolnay/rust-toolchain@master\n      with:\n        toolchain: ${{ matrix.rust }}\n    - run: cargo build --all --verbose\n    - run: cargo doc --all --verbose\n    - run: cargo test --all --verbose\n\n  rustfmt:\n    runs-on: ubuntu-latest\n    steps:\n    - name: Checkout repository\n      uses: actions/checkout@v4\n    - name: Install Rust\n      uses: dtolnay/rust-toolchain@master\n      with:\n        toolchain: stable\n        components: rustfmt\n    - name: Check formatting\n      run: cargo fmt --all --check\n"
  },
  {
    "path": ".gitignore",
    "content": "/target\n/imdb-eval/target\n/imdb-index/target\n**/*.rs.bk\ntags\n/tmp\n"
  },
  {
    "path": "COPYING",
    "content": "This project is dual-licensed under the Unlicense and MIT licenses.\n\nYou may use this code under the terms of either license.\n"
  },
  {
    "path": "Cargo.toml",
    "content": "[package]\nname = \"imdb-rename\"\nversion = \"0.1.6\"  #:version\nauthors = [\"Andrew Gallant <jamslam@gmail.com>\"]\ndescription = \"\"\"\nA command line utility for searching IMDb and renaming your media files.\n\"\"\"\ndocumentation = \"https://github.com/BurntSushi/imdb-rename\"\nhomepage = \"https://github.com/BurntSushi/imdb-rename\"\nrepository = \"https://github.com/BurntSushi/imdb-rename\"\nreadme = \"README.md\"\nkeywords = [\"imdb\", \"movie\", \"index\", \"search\", \"name\"]\nlicense = \"Unlicense/MIT\"\nedition = \"2021\"\n\n[workspace]\nmembers = [\"imdb-eval\", \"imdb-index\"]\n\n[dependencies]\nanyhow = \"1.0.75\"\nbstr = { version = \"1.8.0\", default-features = false, features = [\"std\"] }\nclap = { version = \"2.34.0\", default-features = false }\nflate2 = \"1.0.28\"\nimdb-index = { version = \"0.1.4\", path = \"imdb-index\" }\nlazy_static = \"1.4.0\"\nlog = { version = \"0.4.20\", features = [\"std\"] }\nregex = \"1.10.2\"\ntabwriter = \"1.3.0\"\nureq = { version = \"2.9.1\", default-features = false, features = [\"tls\"] }\nwalkdir = \"2.4.0\"\n\n[profile.release]\ndebug = true\n"
  },
  {
    "path": "LICENSE-MIT",
    "content": "The MIT License (MIT)\n\nCopyright (c) 2015 Andrew Gallant\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in\nall copies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\nTHE SOFTWARE.\n"
  },
  {
    "path": "README.md",
    "content": "imdb-rename\n===========\nA command line tool to rename media files based on titles from IMDb.\nimdb-rename downloads the official IMDb data set and creates a local index to\nuse for fast fuzzy searching.\n\n[![Linux build status](https://api.travis-ci.org/BurntSushi/imdb-rename.svg)](https://travis-ci.org/BurntSushi/imdb-rename)\n[![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/imdb-rename?svg=true)](https://ci.appveyor.com/project/BurntSushi/imdb-rename)\n[![](http://meritbadge.herokuapp.com/imdb-rename)](https://crates.io/crates/imdb-rename)\n\nDual-licensed under MIT or the [UNLICENSE](http://unlicense.org).\n\n\n### Installation\n\n**[Archives of precompiled binaries for imdb-rename are available for Windows,\nmacOS and Linux.](https://github.com/BurntSushi/imdb-rename/releases)**\n\nOtherwise, users are expected to compile imdb-rename from source:\n\n```\n$ git clone https://github.com/BurntSushi/imdb-rename\n$ cd imdb-rename\n$ cargo build --release\n$ ./target/release/imdb-rename --help\n```\n\nAlternatively, if you have\n[Cargo installed](https://rustup.rs),\nthen you can install imdb-rename directly from\n[crates.io](https://crates.io):\n\n```\n$ cargo install imdb-rename\n```\n\nimdb-rename's minimum supported Rust version is **1.28.0**.\n\n#### Archlinux\n\nAn AUR package is available: [imdb-rename](https://aur.archlinux.org/packages/imdb-rename/).\n\n### Quick example\n\nEver since Season 1 of The Simpsons came out on DVD, I've been collecting them\nand ripping them on to my hard drive. My process is somewhat manual, but I\nwind up with a directory that looks like this:\n\n```\nS18E01.mkv  S18E05.mkv  S18E09.mkv  S18E13.mkv  S18E17.mkv  S18E21.mkv\nS18E02.mkv  S18E06.mkv  S18E10.mkv  S18E14.mkv  S18E18.mkv  S18E22.mkv\nS18E03.mkv  S18E07.mkv  S18E11.mkv  S18E15.mkv  S18E19.mkv\nS18E04.mkv  S18E08.mkv  S18E12.mkv  S18E16.mkv  S18E20.mkv\n```\n\nIt would be much nicer if these files had their proper episode titles.\nimdb-rename can rename these files automatically using episode titles from\nIMDb:\n\n```\n$ imdb-rename -q 'the simpsons {show}' *.mkv\n```\n\nThis command ran a query with the `-q` flag to identify the TV show, provided\nthe files to rename, and... presto!\n\n```\nS18E01 - The Mook, the Chef, the Wife and Her Homer.mkv\nS18E02 - Jazzy & The Pussycats.mkv\nS18E03 - Please Homer, Don't Hammer 'Em.mkv\nS18E04 - Treehouse of Horror XVII.mkv\nS18E05 - G.I. (Annoyed Grunt).mkv\nS18E06 - Moe'N'a Lisa.mkv\nS18E07 - Ice Cream of Margie: With the Light Blue Hair.mkv\nS18E08 - The Haw-Hawed Couple.mkv\nS18E09 - Kill Gil, Vol. 1 & 2.mkv\nS18E10 - The Wife Aquatic.mkv\nS18E11 - Revenge Is a Dish Best Served Three Times.mkv\nS18E12 - Little Big Girl.mkv\nS18E13 - Springfield Up.mkv\nS18E14 - Yokel Chords.mkv\nS18E15 - Rome-old and Juli-eh.mkv\nS18E16 - Homerazzi.mkv\nS18E17 - Marge Gamer.mkv\nS18E18 - The Boys of Bummer.mkv\nS18E19 - Crook and Ladder.mkv\nS18E20 - Stop or My Dog Will Shoot.mkv\nS18E21 - 24 Minutes.mkv\nS18E22 - You Kent Always Say What You Want.mkv\n```\n\n\n### Fancier example\n\nimdb-rename isn't limited to just renaming TV episodes based on season/episode\nnumbers. It can also perform a fuzzy match based on the contents of the\nfile name. For example, given this file:\n\n```\nThor.Ragnarok.2017.1080p.WEB-DL.DD5.1.H264-FGT.mkv\n```\n\nWe can \"clean it up\" and rename it to a nice title like so:\n\n```\n$ imdb-rename Thor.Ragnarok.2017.1080p.WEB-DL.DD5.1.H264-FGT.mkv\n```\n\nwhich gives us:\n\n```\nThor: Ragnarok (2017).mkv\n```\n\n\n### Freeform searching\n\nWe can also use imdb-rename to search IMDb, which is the default behavior\nwhen a `-q/--query` is provided without any file names:\n\n```\n$ imdb-rename -q 'homey loves flanders'\n#     score  id         kind       title                   year  tv\n1     1.000  tt0773646  tvEpisode  Homer Loves Flanders    1994  S05E16 The Simpsons\n2     0.646  tt2101691  tvEpisode  Tiny Loves Flowers      N/A   S02E08 Dinosaur Train\n3     0.568  tt3203408  tvEpisode  Courtney Loves Love     2014  S01E05 Courtney Loves Dallas\n4     0.561  tt1722576  short      In Flanders Fields      2010\n5     0.561  tt2253780  tvSeries   In Vlaamse Velden       2014\n6     0.555  tt4528474  video      My Lovely Homeland      2011\n7     0.551  tt0220646  tvMovie    Moll Flanders           1975\n[... results truncated ...]\n```\n\nNotice that our query had a typo in it. imdb-rename does its best to find the\nmost relevant results. It is also fast. Even though the above query searches\nthrough all 6 million names in IMDb, it runs in under 100ms. This is thanks to\nusing an inverted index memory mapped from disk.\n\n\n### How does it work?\n\nimdb-rename works by downloading\n[approved datasets from IMDb](https://www.imdb.com/interfaces/),\nand creating an inverted index based on ngrams extracted\nfrom the names in IMDb's data. The inverted index provides a\nquick way to search and rank results using techniques from\n[information retrieval](https://nlp.stanford.edu/IR-book/)\nsuch as\n[Okapi-BM25](https://en.wikipedia.org/wiki/Okapi_BM25).\n\n\n### Motivation\n\nMy motivation for building this tool is somewhat idiosyncratic, but three-fold:\n\n1. I find it very convenient to have a tool to rename media files\n   automatically. imdb-rename is my third iteration on this tool. The first was\n   an unpublished hodge podge of Python scripts and a MySQL database. The\n   second was a\n   [Go program with a PostgreSQL database](https://github.com/BurntSushi/goim).\n   The Go program served me well, but IMDb retired their old data format, which\n   required me to build a new tool to adapt.\n2. I've been working on a low-level information retrieval library off-and-on\n   for a couple years, and initially built this tool on top of that library as\n   a form of dogfooding. It didn't work out as well as I'd hoped, so I scrapped\n   the generic library and built out a specific solution tailored to IMDb. I'm\n   no longer dogfooding directly, but I've established a useful baseline.\n3. I want more people to learn about information retrieval, and I believe this\n   tool can serve to teach others. In particular, imdb-rename is a complete\n   end-to-end information retrieval system that is fast, solves a real problem,\n   is only a few thousand lines of code and comes with a built-in\n   evaluation that is easy to run.\n\nThis tool is perhaps a bit over engineered, but I had fun with it. Believe it\nor not, parts of imdb-rename are intentionally simple at the cost of both query\nspeed and size on disk!\n\n\n### Evaluation\n\nIt is possible to run an evaluation to compare the various parameters available\nfor searching. The evaluation system is available as a separate tool called\nimdb-eval, which is included in this repository. To use it, we must first build\nit:\n\n```\n$ git clone https://github.com/BurntSushi/imdb-rename\n$ cd imdb-rename\n$ cargo build --release --all\n$ ./target/release/imdb-eval --help\n```\n\nRunning an evaluation is simple. We can run an evaluation on all combinations\nof scorer and similarity function, along with ngram sizes of 3 and 4 like so:\n(This will use truth data that is built into the `imdb-eval` binary.)\n\n```\n$ ./target/release/imdb-eval --ngram-size 3 --ngram-size 4 | tee eval.csv\n```\n\nThis will output the results of running a search on every item in the truth\ndata. The results include the rank of the expected answer. The results can be\nsummarized into a single score called the\n[Mean Reciprocal Rank](https://en.wikipedia.org/wiki/Mean_reciprocal_rank)\n(which is itself a specific instance of MAP, or mean average precision)\nwith the `--summarize` flag like so:\n\n```\n$ ./target/release/imdb-eval --summarize eval.csv\n```\n\nIf you have [xsv](https://github.com/BurntSushi/xsv) installed, then the\nresults can be easily sorted and formatted:\n\n```\n$ ./target/release/imdb-eval --summarize eval.csv | xsv sort -R -s mrr | xsv table\n```\n\nIf you want to tweak the truth data, then you might consider starting with the\nbundled truth data (assuming you're at the root of the imdb-rename repository):\n\n```\n$ $EDITOR data/eval/truth.toml\n$ ./target/release/imdb-eval --ngram-size 3 --ngram-size 4 --truth data/eval/truth.toml\n```\n\n\n### What does this tool not do?\n\nimdb-rename is a tool for renaming media files, and to the extent that searching\nIMDb facilitates renaming files, it is also a search tool. There is no\nintent to develop this further to explore all IMDb data, such as cast/crew\ninformation.\n\nFolks interested in building a different type of IMDb tool may be interested\nin the [`imdb-index`](https://docs.rs/imdb-index) crate, which provides\nprogrammatic access to the index created by imdb-rename.\n\n\n### IMDb licensing\n\nThe data used by imdb-rename is retrieved from\n[IMDb datasets](https://www.imdb.com/interfaces/).\nIn particular, imdb-rename will never scrape imdb.com, and only uses the data\nprovided by IMDb in the `tsv` files.\n\nAdditionally, imdb-rename must only be used for non-commercial and personal\nuses.\n"
  },
  {
    "path": "UNLICENSE",
    "content": "This is free and unencumbered software released into the public domain.\n\nAnyone is free to copy, modify, publish, use, compile, sell, or\ndistribute this software, either in source code form or as a compiled\nbinary, for any purpose, commercial or non-commercial, and by any\nmeans.\n\nIn jurisdictions that recognize copyright laws, the author or authors\nof this software dedicate any and all copyright interest in the\nsoftware to the public domain. We make this dedication for the benefit\nof the public at large and to the detriment of our heirs and\nsuccessors. We intend this dedication to be an overt act of\nrelinquishment in perpetuity of all present and future rights to this\nsoftware under copyright law.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND,\nEXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\nMERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\nIN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR\nOTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,\nARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR\nOTHER DEALINGS IN THE SOFTWARE.\n\nFor more information, please refer to <http://unlicense.org/>\n"
  },
  {
    "path": "data/eval/truth.toml",
    "content": "[[task]]\nquery = \"the matrix\"\nanswer = \"tt0133093\"\n\n[[task]]\nquery = \"homey the clown\"\nanswer = \"tt0701128\"\n\n[[task]]\nquery = \"homer loves\"\nanswer = \"tt0773646\"\n\n[[task]]\nquery = \"the matrix: revolutions\"\nanswer = \"tt0242653\"\n\n[[task]]\nquery = \"troy\"\nanswer = \"tt0332452\"\n\n[[task]]\nquery = \"o\"\nanswer = \"tt0184791\"\n\n[[task]]\nquery = \"love and basketball\"\nanswer = \"tt0199725\"\n\n[[task]]\nquery = \"the last one\"\nanswer = \"tt0583434\"\n\n[[task]]\nquery = \"pre-destination\"\nanswer = \"tt2397535\"\n\n[[task]]\nquery = \"1 magic christmas\"\nanswer = \"tt0089731\"\n\n[[task]]\nquery = \"xmen the last stand\"\nanswer = \"tt0376994\"\n\n[[task]]\nquery = \"todliche aura\"\nanswer = \"tt0583427\"\n\n[[task]]\nquery = \"her\"\nanswer = \"tt1798709\"\n\n[[task]]\nquery = \"its a wonderful life\"\nanswer = \"tt0038650\"\n\n[[task]]\nquery = \"jason born\"\nanswer = \"tt4196776\"\n\n[[task]]\nquery = \"cpt america first avenger\"\nanswer = \"tt0458339\"\n\n[[task]]\nquery = \"batman vs superman dawn justice\"\nanswer = \"tt2975590\"\n\n[[task]]\nquery = \"nightmare before christmas\"\nanswer = \"tt0107688\"\n\n[[task]]\nquery = \"the man from earth\"\nanswer = \"tt0756683\"\n\n[[task]]\nquery = \"amazing spiderman 2\"\nanswer = \"tt1872181\"\n\n[[task]]\nquery = \"the revanant\"\nanswer = \"tt1663202\"\n\n[[task]]\nquery = \"imaginarium of dr\"\nanswer = \"tt1054606\"\n\n[[task]]\nquery = \"the dark night\"\nanswer = \"tt0468569\"\n\n[[task]]\nquery = \"the simpsons\"\nanswer = \"tt0462538\"\n\n[[task]]\nquery = \"into the bad lands\"\nanswer = \"tt3865236\"\n\n[[task]]\nquery = \"south park bigger\"\nanswer = \"tt0158983\"\n\n[[task]]\nquery = \"game of shadows sherlock\"\nanswer = \"tt1515091\"\n\n[[task]]\nquery = \"ragnarok\"\nanswer = \"tt3501632\"\n\n[[task]]\nquery = \"riddick\"\nanswer = \"tt0296572\"\n\n[[task]]\nquery = \"voyage dawn treader\"\nanswer = \"tt0980970\"\n\n[[task]]\nquery = \"phenomonon\"\nanswer = \"tt0117333\"\n\n[[task]]\nquery = \"ratchet and clank\"\nanswer = \"tt2865120\"\n\n[[task]]\nquery = \"spiderman homecoming\"\nanswer = \"tt2250912\"\n\n[[task]]\nquery = \"sixth sense\"\nanswer = \"tt0167404\"\n\n[[task]]\nquery = \"there will be blood\"\nanswer = \"tt0469494\"\n\n[[task]]\nquery = \"gangs new york\"\nanswer = \"tt0217505\"\n\n[[task]]\nquery = \"first avenger\"\nanswer = \"tt0458339\"\n\n[[task]]\nquery = \"good shepherd\"\nanswer = \"tt0343737\"\n\n[[task]]\nquery = \"gone with the wind\"\nanswer = \"tt0031381\"\n\n[[task]]\nquery = \"bourne identity\"\nanswer = \"tt0258463\"\n\n[[task]]\nquery = \"seinfeld\"\nanswer = \"tt0098904\"\n\n[[task]]\nquery = \"lincoln\"\nanswer = \"tt0443272\"\n\n[[task]]\nquery = \"sherlock\"\nanswer = \"tt1475582\"\n\n[[task]]\nquery = \"skinner's badass song\"\nanswer = \"tt0777150\"\n\n[[task]]\nquery = \"flying hellish\"\nanswer = \"tt0778451\"\n\n[[task]]\nquery = \"springfield files\"\nanswer = \"tt0701263\"\n\n[[task]]\nquery = \"shot mr burns\"\nanswer = \"tt0701295\"\n\n[[task]]\nquery = \"camp krusty\"\nanswer = \"tt0701142\"\n\n[[task]]\nquery = \"the monorail\"\nanswer = \"tt0701173\"\n\n[[task]]\nquery = \"king homer\"\nanswer = \"tt0701144\"\n\n[[task]]\nquery = \"mr. plow\"\nanswer = \"tt0701184\"\n"
  },
  {
    "path": "data/test/small/title.akas.tsv",
    "content": "titleId\tordering\ttitle\tregion\tlanguage\ttypes\tattributes\tisOriginalTitle\ntt0096697\t10\tSimpsonovi\tSI\t\\N\timdbDisplay\t\\N\t0\ntt0096697\t11\tSimpsonovi\tRS\t\\N\timdbDisplay\t\\N\t0\ntt0096697\t12\tThe Simpsons\tUS\t\\N\t\\N\t\\N\t0\ntt0096697\t13\tGia Dinh Simpsons\tVN\t\\N\timdbDisplay\t\\N\t0\ntt0096697\t14\tSimpsonovci\tSK\t\\N\t\\N\t\\N\t0\ntt0096697\t15\tOs Simpsons\tBR\t\\N\t\\N\t\\N\t0\ntt0096697\t16\tSimpsons\tSE\t\\N\timdbDisplay\t\\N\t0\ntt0096697\t17\tSimpsoni\tHR\t\\N\t\\N\t\\N\t0\ntt0096697\t18\tSimpsoni\tLV\t\\N\timdbDisplay\t\\N\t0\ntt0096697\t19\tDie Simpsons\tXWG\t\\N\t\\N\t\\N\t0\ntt0096697\t1\tLos Simpson\tMX\t\\N\t\\N\t\\N\t0\ntt0096697\t20\tSimpsonovi\tCSHH\t\\N\timdbDisplay\t\\N\t0\ntt0096697\t21\tСемейство Симпсън\tBG\tbg\t\\N\t\\N\t0\ntt0096697\t22\tEls Simpson\tES\tca\timdbDisplay\t\\N\t0\ntt0096697\t23\tThe Simpsons\tGR\t\\N\t\\N\t\\N\t0\ntt0096697\t24\tСiмпсони\tUA\t\\N\t\\N\t\\N\t0\ntt0096697\t25\tSimpsonid\tEE\t\\N\t\\N\t\\N\t0\ntt0096697\t26\tLos Simpson\tES\t\\N\timdbDisplay\t\\N\t0\ntt0096697\t27\tSimpsonowie\tPL\t\\N\timdbDisplay\t\\N\t0\ntt0096697\t28\tOs Simpsons\tPT\t\\N\t\\N\t\\N\t0\ntt0096697\t29\tI Simpson\tIT\t\\N\t\\N\t\\N\t0\ntt0096697\t2\tThe Simpsons\t\\N\t\\N\toriginal\t\\N\t1\ntt0096697\t30\tLes Simpson\tCA\tfr\t\\N\tdubbed version\t0\ntt0096697\t31\tSimpsons\tNO\t\\N\t\\N\t\\N\t0\ntt0096697\t32\tA Simpson család\tHU\t\\N\t\\N\t\\N\t0\ntt0096697\t33\tAl shamshoon\tEG\tar\t\\N\tdubbed version\t0\ntt0096697\t34\tDie Simpsons\tDE\t\\N\timdbDisplay\t\\N\t0\ntt0096697\t35\tFamilia Simpson\tRO\t\\N\t\\N\t\\N\t0\ntt0096697\t36\tLos Simpson\tPE\t\\N\timdbDisplay\t\\N\t0\ntt0096697\t37\tSimpsonai\tLT\t\\N\timdbDisplay\t\\N\t0\ntt0096697\t38\tLes Simpson\tFR\t\\N\t\\N\t\\N\t0\ntt0096697\t3\tLos Simpson\tAR\t\\N\t\\N\t\\N\t0\ntt0096697\t4\tСимпсоны\tRU\t\\N\t\\N\t\\N\t0\ntt0096697\t5\tLos Simpson\tVE\t\\N\t\\N\t\\N\t0\ntt0096697\t6\tSimpson Ailesi\tTR\ttr\timdbDisplay\t\\N\t0\ntt0096697\t7\tSimpsons\tDK\t\\N\t\\N\t\\N\t0\ntt0096697\t8\tSimpsonit\tFI\t\\N\t\\N\t\\N\t0\ntt0096697\t9\tSimpsonovi\tCZ\t\\N\timdbDisplay\t\\N\t0\n"
  },
  {
    "path": "data/test/small/title.basics.tsv",
    "content": "tconst\ttitleType\tprimaryTitle\toriginalTitle\tisAdult\tstartYear\tendYear\truntimeMinutes\tgenres\ntt0348034\ttvEpisode\tSimpsons Roasting on an Open Fire\tSimpsons Roasting on an Open Fire\t0\t1989\t\\N\t30\tAnimation,Comedy\ntt0701059\ttvEpisode\tBart the General\tBart the General\t0\t1990\t\\N\t30\tAnimation,Comedy\ntt0701060\ttvEpisode\tBart the Murderer\tBart the Murderer\t0\t1991\t\\N\t30\tAnimation,Comedy\ntt0701062\ttvEpisode\tBart vs. Thanksgiving\tBart vs. Thanksgiving\t0\t1990\t\\N\t23\tAnimation,Comedy\ntt0701063\ttvEpisode\tBart's Dog Gets an F\tBart's Dog Gets an F\t0\t1991\t\\N\t23\tAnimation,Comedy\ntt0701064\ttvEpisode\tBart's Friend Falls in Love\tBart's Friend Falls in Love\t0\t1992\t\\N\t30\tAnimation,Comedy\ntt0701070\ttvEpisode\tBlack Widower\tBlack Widower\t0\t1992\t\\N\t30\tAnimation,Comedy\ntt0701076\ttvEpisode\tBrother, Can You Spare Two Dimes?\tBrother, Can You Spare Two Dimes?\t0\t1992\t\\N\t30\tAnimation,Comedy\ntt0701077\ttvEpisode\tBrush with Greatness\tBrush with Greatness\t0\t1991\t\\N\t30\tAnimation,Comedy\ntt0701082\ttvEpisode\tColonel Homer\tColonel Homer\t0\t1992\t\\N\t30\tAnimation,Comedy\ntt0701084\ttvEpisode\tDancin' Homer\tDancin' Homer\t0\t1990\t\\N\t30\tAnimation,Comedy\ntt0701098\ttvEpisode\tFlaming Moe's\tFlaming Moe's\t0\t1991\t\\N\t30\tAnimation,Comedy\ntt0701110\ttvEpisode\tHomer Defined\tHomer Defined\t0\t1991\t\\N\t30\tAnimation,Comedy\ntt0701114\ttvEpisode\tHomer at the Bat\tHomer at the Bat\t0\t1992\t\\N\t30\tAnimation,Comedy\ntt0701123\ttvEpisode\tHomer's Night Out\tHomer's Night Out\t0\t1990\t\\N\t30\tAnimation,Comedy\ntt0701124\ttvEpisode\tHomer's Odyssey\tHomer's Odyssey\t0\t1990\t\\N\t30\tAnimation,Comedy\ntt0701140\ttvEpisode\tItchy and Scratchy and Marge\tItchy and Scratchy and Marge\t0\t1990\t\\N\t23\tAnimation,Comedy\ntt0701147\ttvEpisode\tKrusty Gets Busted\tKrusty Gets Busted\t0\t1990\t\\N\t30\tAnimation,Comedy\ntt0701152\ttvEpisode\tLife on the Fast Lane\tLife on the Fast Lane\t0\t1990\t\\N\t30\tAnimation,Comedy\ntt0701153\ttvEpisode\tLike Father, Like Clown\tLike Father, Like Clown\t0\t1991\t\\N\t30\tAnimation,Comedy\ntt0701161\ttvEpisode\tLisa's Pony\tLisa's Pony\t0\t1991\t\\N\t30\tAnimation,Comedy\ntt0701164\ttvEpisode\tLisa's Substitute\tLisa's Substitute\t0\t1991\t\\N\t30\tAnimation,Comedy\ntt0701178\ttvEpisode\tMoaning Lisa\tMoaning Lisa\t0\t1990\t\\N\t30\tAnimation,Comedy\ntt0701183\ttvEpisode\tMr. Lisa Goes to Washington\tMr. Lisa Goes to Washington\t0\t1991\t\\N\t30\tAnimation,Comedy\ntt0701191\ttvEpisode\tOh Brother, Where Art Thou?\tOh Brother, Where Art Thou?\t0\t1991\t\\N\t23\tAnimation,Comedy\ntt0701192\ttvEpisode\tOld Money\tOld Money\t0\t1991\t\\N\t23\tAnimation,Comedy\ntt0701195\ttvEpisode\tOne Fish, Two Fish, Blowfish, Blue Fish\tOne Fish, Two Fish, Blowfish, Blue Fish\t0\t1991\t\\N\t23\tAnimation,Comedy\ntt0701200\ttvEpisode\tRadio Bart\tRadio Bart\t0\t1992\t\\N\t30\tAnimation,Comedy\ntt0701204\ttvEpisode\tSeparate Vocations\tSeparate Vocations\t0\t1992\t\\N\t30\tAnimation,Comedy\ntt0701211\ttvEpisode\tSimpson and Delilah\tSimpson and Delilah\t0\t1990\t\\N\t23\tAnimation,Comedy\ntt0701215\ttvEpisode\tSome Enchanted Evening\tSome Enchanted Evening\t0\t1990\t\\N\t30\tAnimation,Comedy\ntt0701217\ttvEpisode\tStark Raving Dad\tStark Raving Dad\t0\t1991\t\\N\t30\tAnimation,Comedy\ntt0701228\ttvEpisode\tThe Call of the Simpsons\tThe Call of the Simpsons\t0\t1990\t\\N\t30\tAnimation,Comedy\ntt0701232\ttvEpisode\tThe Crepes of Wrath\tThe Crepes of Wrath\t0\t1990\t\\N\t30\tAnimation,Comedy\ntt0701254\ttvEpisode\tThe Otto Show\tThe Otto Show\t0\t1992\t\\N\t30\tAnimation,Comedy\ntt0701269\ttvEpisode\tThe Way We Was\tThe Way We Was\t0\t1991\t\\N\t23\tAnimation,Comedy\ntt0701275\ttvEpisode\tThree Men and a Comic Book\tThree Men and a Comic Book\t0\t1991\t\\N\t30\tAnimation,Comedy\ntt0701278\ttvEpisode\tTreehouse of Horror\tTreehouse of Horror\t0\t1990\t\\N\t30\tAnimation,Comedy\ntt0756398\ttvEpisode\tThe Telltale Head\tThe Telltale Head\t0\t1990\t\\N\t30\tAnimation,Comedy\ntt0756399\ttvEpisode\tThere's No Disgrace Like Home\tThere's No Disgrace Like Home\t0\t1990\t\\N\t30\tAnimation,Comedy\ntt0756593\ttvEpisode\tBart the Genius\tBart the Genius\t0\t1990\t\\N\t30\tAnimation,Comedy\ntt0757017\ttvEpisode\tBart Gets Hit by a Car\tBart Gets Hit by a Car\t0\t1991\t\\N\t23\tAnimation,Comedy\ntt0757023\ttvEpisode\tTwo Cars in Every Garage and Three Eyes on Every Fish\tTwo Cars in Every Garage and Three Eyes on Every Fish\t0\t1990\t\\N\t23\tAnimation,Comedy\ntt0759267\ttvEpisode\tTreehouse of Horror II\tTreehouse of Horror II\t0\t1991\t\\N\t30\tAnimation,Comedy\ntt0763024\ttvEpisode\tBart Gets an F\tBart Gets an F\t0\t1990\t\\N\t30\tAnimation,Comedy\ntt0763042\ttvEpisode\tWhen Flanders Failed\tWhen Flanders Failed\t0\t1991\t\\N\t30\tAnimation,Comedy\ntt0766140\ttvEpisode\tThe War of the Simpsons\tThe War of the Simpsons\t0\t1991\t\\N\t30\tAnimation,Comedy\ntt0767438\ttvEpisode\tBart the Daredevil\tBart the Daredevil\t0\t1990\t\\N\t23\tAnimation,Comedy\ntt0767440\ttvEpisode\tBlood Feud\tBlood Feud\t0\t1991\t\\N\t30\tAnimation,Comedy\ntt0767442\ttvEpisode\tDead Putting Society\tDead Putting Society\t0\t1990\t\\N\t30\tAnimation,Comedy\ntt0767443\ttvEpisode\tHomer vs. Lisa and the 8th Commandment\tHomer vs. Lisa and the 8th Commandment\t0\t1991\t\\N\t23\tAnimation,Comedy\ntt0767445\ttvEpisode\tPrincipal Charming\tPrincipal Charming\t0\t1991\t\\N\t23\tAnimation,Comedy\ntt0768553\ttvEpisode\tBart the Lover\tBart the Lover\t0\t1992\t\\N\t30\tAnimation,Comedy\ntt0768554\ttvEpisode\tDog of Death\tDog of Death\t0\t1992\t\\N\t30\tAnimation,Comedy\ntt0768555\ttvEpisode\tHomer Alone\tHomer Alone\t0\t1992\t\\N\t30\tAnimation,Comedy\ntt0768556\ttvEpisode\tI Married Marge\tI Married Marge\t0\t1991\t\\N\t30\tAnimation,Comedy\ntt0768557\ttvEpisode\tLisa the Greek\tLisa the Greek\t0\t1992\t\\N\t30\tAnimation,Comedy\ntt0768558\ttvEpisode\tSaturdays of Thunder\tSaturdays of Thunder\t0\t1991\t\\N\t30\tAnimation,Comedy\ntt0769743\ttvEpisode\tBurns Verkaufen der Kraftwerk\tBurns Verkaufen der Kraftwerk\t0\t1991\t\\N\t30\tAnimation,Comedy\n"
  },
  {
    "path": "data/test/small/title.episode.tsv",
    "content": "tconst\tparentTconst\tseasonNumber\tepisodeNumber\ntt0348034\ttt0096697\t1\t1\ntt0701059\ttt0096697\t1\t5\ntt0701060\ttt0096697\t3\t4\ntt0701062\ttt0096697\t2\t7\ntt0701063\ttt0096697\t2\t16\ntt0701064\ttt0096697\t3\t23\ntt0701070\ttt0096697\t3\t21\ntt0701076\ttt0096697\t3\t24\ntt0701077\ttt0096697\t2\t18\ntt0701082\ttt0096697\t3\t20\ntt0701084\ttt0096697\t2\t5\ntt0701098\ttt0096697\t3\t10\ntt0701110\ttt0096697\t3\t5\ntt0701114\ttt0096697\t3\t17\ntt0701123\ttt0096697\t1\t10\ntt0701124\ttt0096697\t1\t3\ntt0701140\ttt0096697\t2\t9\ntt0701147\ttt0096697\t1\t12\ntt0701152\ttt0096697\t1\t9\ntt0701153\ttt0096697\t3\t6\ntt0701161\ttt0096697\t3\t8\ntt0701164\ttt0096697\t2\t19\ntt0701178\ttt0096697\t1\t6\ntt0701183\ttt0096697\t3\t2\ntt0701191\ttt0096697\t2\t15\ntt0701192\ttt0096697\t2\t17\ntt0701195\ttt0096697\t2\t11\ntt0701200\ttt0096697\t3\t13\ntt0701204\ttt0096697\t3\t18\ntt0701211\ttt0096697\t2\t2\ntt0701215\ttt0096697\t1\t13\ntt0701217\ttt0096697\t3\t1\ntt0701228\ttt0096697\t1\t7\ntt0701232\ttt0096697\t1\t11\ntt0701254\ttt0096697\t3\t22\ntt0701269\ttt0096697\t2\t12\ntt0701275\ttt0096697\t2\t21\ntt0701278\ttt0096697\t2\t3\ntt0756398\ttt0096697\t1\t8\ntt0756399\ttt0096697\t1\t4\ntt0756593\ttt0096697\t1\t2\ntt0757017\ttt0096697\t2\t10\ntt0757023\ttt0096697\t2\t4\ntt0759267\ttt0096697\t3\t7\ntt0763024\ttt0096697\t2\t1\ntt0763042\ttt0096697\t3\t3\ntt0766140\ttt0096697\t2\t20\ntt0767438\ttt0096697\t2\t8\ntt0767440\ttt0096697\t2\t22\ntt0767442\ttt0096697\t2\t6\ntt0767443\ttt0096697\t2\t13\ntt0767445\ttt0096697\t2\t14\ntt0768553\ttt0096697\t3\t16\ntt0768554\ttt0096697\t3\t19\ntt0768555\ttt0096697\t3\t15\ntt0768556\ttt0096697\t3\t12\ntt0768557\ttt0096697\t3\t14\ntt0768558\ttt0096697\t3\t9\ntt0769743\ttt0096697\t3\t11\n"
  },
  {
    "path": "data/test/small/title.ratings.tsv",
    "content": "tconst\taverageRating\tnumVotes\ntt0000001\t5.8\t1356\ntt0000002\t6.5\t157\ntt0000003\t6.6\t939\ntt0000004\t6.4\t93\ntt0000005\t6.2\t1630\ntt0000006\t5.6\t79\ntt0000007\t5.5\t546\ntt0000008\t5.6\t1454\ntt0000009\t5.4\t62\ntt0000010\t6.9\t4880\ntt0000011\t5.4\t193\ntt0000012\t7.4\t8102\ntt0000013\t5.7\t1239\ntt0000014\t7.2\t3542\ntt0000015\t6.2\t606\ntt0000016\t5.9\t922\ntt0000017\t4.8\t181\ntt0000018\t5.5\t389\ntt0000019\t6.7\t12\ntt0000020\t5.1\t219\ntt0000022\t5.1\t703\ntt0000023\t5.7\t875\ntt0000024\t5.8\t18\ntt0000025\t5.0\t14\ntt0000026\t5.7\t1086\n"
  },
  {
    "path": "imdb-eval/COPYING",
    "content": "This project is dual-licensed under the Unlicense and MIT licenses.\n\nYou may use this code under the terms of either license.\n"
  },
  {
    "path": "imdb-eval/Cargo.toml",
    "content": "[package]\nname = \"imdb-eval\"\nversion = \"0.1.2\"\nauthors = [\"Andrew Gallant <jamslam@gmail.com>\"]\ndescription = \"\"\"\nA command line utility for evaluating the IMDb name index.\n\"\"\"\ndocumentation = \"https://github.com/BurntSushi/imdb-rename\"\nhomepage = \"https://github.com/BurntSushi/imdb-rename\"\nrepository = \"https://github.com/BurntSushi/imdb-rename\"\nreadme = \"README.md\"\nkeywords = [\"imdb\", \"index\", \"search\", \"name\", \"evaluation\"]\nlicense = \"Unlicense/MIT\"\nedition = \"2021\"\n\n[dependencies]\nanyhow = \"1.0.75\"\nclap = { version = \"2.34.0\", default-features = false }\ncsv = \"1.3.0\"\nimdb-index = { version = \"0.1.4\", path = \"../imdb-index\" }\nlazy_static = \"1.4.0\"\nlog = { version = \"0.4.20\", features = [\"std\"] }\nserde = { version = \"1.0.193\", features = [\"derive\"] }\ntoml = \"0.8.8\"\n"
  },
  {
    "path": "imdb-eval/LICENSE-MIT",
    "content": "The MIT License (MIT)\n\nCopyright (c) 2015 Andrew Gallant\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in\nall copies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\nTHE SOFTWARE.\n"
  },
  {
    "path": "imdb-eval/README.md",
    "content": "imdb-eval\n=========\nA command line tool for evaluating imdb-rename's search functionality.\n\n[![Linux build status](https://api.travis-ci.org/BurntSushi/imdb-rename.png)](https://travis-ci.org/BurntSushi/imdb-rename)\n[![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/imdb-rename?svg=true)](https://ci.appveyor.com/project/BurntSushi/imdb-rename)\n[![](http://meritbadge.herokuapp.com/imdb-rename)](https://crates.io/crates/imdb-rename)\n\n\n### Installation\n\nNo release binaries are provided for imdb-eval. Instead, users should compile\nit from source:\n\n```\n$ git clone https://github.com/BurntSushi/imdb-rename\n$ cd imdb-rename\n$ cargo build --release --all\n$ ./target/release/imdb-eval --help\n```\n\nFor more details on how to use imdb-eval, please see imdb-rename's README.\n"
  },
  {
    "path": "imdb-eval/UNLICENSE",
    "content": "This is free and unencumbered software released into the public domain.\n\nAnyone is free to copy, modify, publish, use, compile, sell, or\ndistribute this software, either in source code form or as a compiled\nbinary, for any purpose, commercial or non-commercial, and by any\nmeans.\n\nIn jurisdictions that recognize copyright laws, the author or authors\nof this software dedicate any and all copyright interest in the\nsoftware to the public domain. We make this dedication for the benefit\nof the public at large and to the detriment of our heirs and\nsuccessors. We intend this dedication to be an overt act of\nrelinquishment in perpetuity of all present and future rights to this\nsoftware under copyright law.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND,\nEXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\nMERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\nIN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR\nOTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,\nARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR\nOTHER DEALINGS IN THE SOFTWARE.\n\nFor more information, please refer to <http://unlicense.org/>\n"
  },
  {
    "path": "imdb-eval/src/eval.rs",
    "content": "use std::collections::BTreeMap;\nuse std::fmt;\nuse std::fs::File;\nuse std::io::Read;\nuse std::path::{Path, PathBuf};\nuse std::time::{Duration, Instant};\nuse std::vec;\n\nuse imdb_index::{\n    Index, IndexBuilder, MediaEntity, NameScorer, NgramType, Query, Searcher,\n    Similarity,\n};\nuse lazy_static::lazy_static;\nuse serde::{Deserialize, Serialize};\n\n/// The default truth data used in an evaluation. It's small enough that we\n/// embed it directly into the binary.\nconst TRUTH_DATA: &str = include_str!(\"../../data/eval/truth.toml\");\n\nlazy_static! {\n    /// A structured representation of the default truth data.\n    static ref TRUTH: Truth = toml::from_str(TRUTH_DATA).unwrap();\n}\n\n/// The truth data for our evaluation.\n///\n/// The truth data consists of a set of information needs that we call \"tasks.\"\n#[derive(Clone, Debug, Deserialize)]\nstruct Truth {\n    #[serde(rename = \"task\")]\n    tasks: Vec<Task>,\n}\n\n/// A task or \"information need\" defined by the truth data. Each task\n/// corresponds to a query that we feed to the name index, and each task has a\n/// single correct answer.\n#[derive(Clone, Debug, Deserialize)]\nstruct Task {\n    query: String,\n    answer: String,\n}\n\nimpl Truth {\n    /// Load truth data from the given TOML file.\n    fn from_path<P: AsRef<Path>>(path: P) -> anyhow::Result<Truth> {\n        let path = path.as_ref();\n\n        let mut contents = String::new();\n        File::open(path)?.read_to_string(&mut contents)?;\n        Ok(toml::from_str(&contents)?)\n    }\n}\n\n/// A specification for running an evaluation. Fundamentally, a specification\n/// describes the thing we want to evaluate, where the thing we want to\n/// evaluate is a specific configuration of how we build *and* search an IMDb\n/// index.\n///\n/// A specification describes both how the index should be built and how\n/// queries should be generated. 
Specifications with equivalent index settings\n/// may reuse the same on-disk index. For example, the ngram size and type are\n/// index settings, but the similarity function, name scorer and result size\n/// are all query time settings.\n///\n/// A specification cannot itself produce a complete query. Namely, a\n/// specification requires an information need (called a \"task\") to construct\n/// a query specific to that need. The results of that query are then compared\n/// with that information need's answer to determine the score, which is,\n/// invariably, a reflection of how well the configuration given by this\n/// specification performs.\n#[derive(Clone, Debug, Eq, PartialEq)]\npub struct Spec {\n    result_size: usize,\n    ngram_size: usize,\n    ngram_type: NgramType,\n    sim: Similarity,\n    scorer: Option<NameScorer>,\n}\n\nimpl Spec {\n    /// Create a new spec using a default configuration.\n    pub fn new() -> Spec {\n        Spec {\n            result_size: 30,\n            ngram_size: 3,\n            ngram_type: NgramType::default(),\n            sim: Similarity::None,\n            scorer: Some(NameScorer::OkapiBM25),\n        }\n    }\n\n    /// Set the result size for this specification.\n    ///\n    /// This returns an error if the given size is less than `1`.\n    pub fn with_result_size(\n        mut self,\n        result_size: usize,\n    ) -> anyhow::Result<Spec> {\n        if result_size < 1 {\n            anyhow::bail!(\n                \"result size {} is invalid, must be greater than 0\",\n                result_size\n            );\n        }\n        self.result_size = result_size;\n        Ok(self)\n    }\n\n    /// Set the ngram size for this specification.\n    ///\n    /// This returns an error if the given size is less than `2`.\n    pub fn with_ngram_size(\n        mut self,\n        ngram_size: usize,\n    ) -> anyhow::Result<Spec> {\n        if ngram_size < 2 {\n            anyhow::bail!(\n                \"ngram size {} is 
invalid, must be greater than 1\",\n                ngram_size,\n            );\n        }\n        self.ngram_size = ngram_size;\n        Ok(self)\n    }\n\n    /// Set the ngram type for this specification.\n    pub fn with_ngram_type(mut self, ngram_type: NgramType) -> Spec {\n        self.ngram_type = ngram_type;\n        self\n    }\n\n    /// Set the similarity ranker function for this specification.\n    pub fn with_similarity(mut self, sim: Similarity) -> Spec {\n        self.sim = sim;\n        self\n    }\n\n    /// Set the name scorer for this specification.\n    ///\n    /// Note that if the given scorer is `None`, then an evaluation will likely\n    /// be quite slow, since each information need will result in an exhaustive\n    /// search of the corpus.\n    pub fn with_scorer(mut self, scorer: Option<NameScorer>) -> Spec {\n        self.scorer = scorer;\n        self\n    }\n\n    /// Evaluate this specification against the built-in truth data.\n    pub fn evaluate<P1: AsRef<Path>, P2: AsRef<Path>>(\n        &self,\n        data_dir: P1,\n        eval_dir: P2,\n    ) -> anyhow::Result<Evaluation> {\n        let searcher = Searcher::new(self.index(data_dir, eval_dir)?);\n        Ok(Evaluation {\n            evaluator: Evaluator { spec: self, searcher },\n            tasks: TRUTH.clone().tasks.into_iter(),\n        })\n    }\n\n    /// Evaluate this specification against a set of truth data at the given\n    /// file path.\n    pub fn evaluate_with<P1: AsRef<Path>, P2: AsRef<Path>, P3: AsRef<Path>>(\n        &self,\n        data_dir: P1,\n        eval_dir: P2,\n        truth_path: P3,\n    ) -> anyhow::Result<Evaluation> {\n        let searcher = Searcher::new(self.index(data_dir, eval_dir)?);\n        Ok(Evaluation {\n            evaluator: Evaluator { spec: self, searcher },\n            tasks: Truth::from_path(truth_path)?.tasks.into_iter(),\n        })\n    }\n\n    /// Create a query derived from this specification and a particular\n    /// 
information need or \"task.\"\n    fn query(&self, task: &Task) -> Query {\n        Query::new()\n            .name(&task.query)\n            .name_scorer(self.scorer.clone())\n            .similarity(self.sim.clone())\n            .size(self.result_size)\n    }\n\n    /// Either open or create an index suitable for this specification.\n    ///\n    /// If no index exists in the expected sub-directory of `eval_dir`, then\n    /// a new index is created.\n    fn index<P1: AsRef<Path>, P2: AsRef<Path>>(\n        &self,\n        data_dir: P1,\n        eval_dir: P2,\n    ) -> anyhow::Result<Index> {\n        let index_dir = self.index_dir(eval_dir.as_ref());\n        Ok(if index_dir.exists() {\n            Index::open(data_dir, index_dir)?\n        } else {\n            IndexBuilder::new()\n                .ngram_size(self.ngram_size)\n                .ngram_type(self.ngram_type)\n                .create(data_dir, index_dir)?\n        })\n    }\n\n    /// The sub-directory of `eval_dir` in which to store this specification's\n    /// index.\n    fn index_dir<P: AsRef<Path>>(&self, eval_dir: P) -> PathBuf {\n        eval_dir.as_ref().join(self.index_name())\n    }\n\n    /// The expected name of the index for this evaluation specification.\n    ///\n    /// The name of the index is derived specifically from this specification's\n    /// index-time settings, such as the ngram size. 
This permits multiple\n    /// distinct specifications to reuse the same index.\n    fn index_name(&self) -> String {\n        format!(\"ngram-{}_ngram-type-{}\", self.ngram_size, self.ngram_type)\n    }\n}\n\nimpl Default for Spec {\n    fn default() -> Spec {\n        Spec::new()\n    }\n}\n\nimpl fmt::Display for Spec {\n    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {\n        let scorer = match self.scorer {\n            None => \"none\".to_string(),\n            Some(ref scorer) => scorer.to_string(),\n        };\n        write!(\n            f,\n            \"size-{}_ngram-{}_ngram-type-{}_sim-{}_scorer-{}\",\n            self.result_size,\n            self.ngram_size,\n            self.ngram_type,\n            self.sim,\n            scorer,\n        )\n    }\n}\n\n/// A summary of the results of evaluating every information need or \"task\" for\n/// a single evaluation specification. The summary boils the quality of the\n/// specification down to two figures: the mean reciprocal rank and the ratio\n/// of tasks that produced an answer.\n///\n/// The mean reciprocal rank measures the average precision of the\n/// specification. That is, it measures how well we answer the following\n/// question: \"If your search produced the correct answer, how highly was it\n/// ranked?\"\n///\n/// The ratio of tasks that produced an answer measures how well we answer the\n/// following question: \"Of the searches ran, how many of them produced the\n/// correct result at any rank?\"\n///\n/// Implicit in the evaluation is the notion of a bounded number of results.\n/// That is, every specification dictates the maximum number of results\n/// returned by a search. If the answer isn't in that result set, then we stop\n/// there and declare that the answer wasn't found.\n///\n/// The reason for using two different scores is so that they counter balance\n/// each other. 
Namely, a specification that does really well on a smaller\n/// number of results might end up with a higher MRR than other specifications,\n/// but will have a lower ratio of successful searches.\n#[derive(Debug, Deserialize, Serialize)]\npub struct Summary {\n    /// The specification name that this result is summarizing.\n    pub name: String,\n    /// Mean reciprocal rank.\n    pub mrr: f64,\n    /// The ratio of tasks that found an answer. The higher the better.\n    pub found: f64,\n}\n\nimpl Summary {\n    /// Returns a group of summaries for all distinct specifications found\n    /// in the batch of results given.\n    ///\n    /// If no results are given, then no summaries are returned.\n    pub fn from_task_results(results: &[TaskResult]) -> Vec<Summary> {\n        let mut grouped: BTreeMap<&str, Vec<&TaskResult>> = BTreeMap::new();\n        for result in results {\n            grouped.entry(&result.name).or_insert(vec![]).push(result);\n        }\n\n        let mut summaries = vec![];\n        for results in grouped.values() {\n            summaries.push(Summary::from_same_task_results(results));\n        }\n        summaries\n    }\n\n    /// Returns a summary for a single group of task results. All the results\n    /// given must have the same name, otherwise this panics. 
This also panics\n    /// if the given results are empty.\n    fn from_same_task_results(results: &[&TaskResult]) -> Summary {\n        assert!(!results.is_empty());\n        assert!(results.iter().all(|r| results[0].name == r.name));\n\n        let mut precision_sum = 0.0;\n        let mut found = 0u64;\n        for r in results {\n            precision_sum += r.rank.map_or(0.0, |rank| 1.0 / (rank as f64));\n            if r.rank.is_some() {\n                found += 1;\n            }\n        }\n        Summary {\n            name: results[0].name.clone(),\n            mrr: precision_sum / (results.len() as f64),\n            found: (found as f64) / (results.len() as f64),\n        }\n    }\n}\n\n/// The result of evaluating a single information need or \"task.\"\n#[derive(Debug, Deserialize, Serialize)]\npub struct TaskResult {\n    /// The name of the evaluation's spec. This name includes all of the\n    /// parameters that influence the evaluation, such as ngram size,\n    /// similarity function, etc.\n    pub name: String,\n    /// The freeform text query, which represents a specific manifestation of\n    /// this information need. Generally speaking, this corresponds to the\n    /// query that an end user will type.\n    pub query: String,\n    /// The IMDb identifier corresponding to a singular answer expected by an\n    /// end user.\n    pub answer: String,\n    /// If the answer appears in the search results, then this corresponds to\n    /// the rank of that search result. The rank is determined by the answer's\n    /// absolute position in the list of ranked search results.\n    ///\n    /// Ties in the ranked list are handled by assigning the maximum possible\n    /// rank to each search result with the same score. For example, if we\n    /// request 30 results and the answer is incidentally 10th in the list but\n    /// every search result has the same score of 1.0, then the rank of our\n    /// answer is 30. 
(Indeed, the rank of every search result is 30 in this\n    /// example.)\n    pub rank: Option<u64>,\n    /// The time it took to execute this query, in seconds.\n    pub duration_seconds: f64,\n}\n\n/// An evaluation is an iterator over all of the results of evaluating every\n/// information need in the truth data.\n#[derive(Debug)]\npub struct Evaluation<'s> {\n    /// The evaluator, which turns an information need into a `TaskResult`.\n    evaluator: Evaluator<'s>,\n    /// All of the tasks to evaluate.\n    tasks: vec::IntoIter<Task>,\n}\n\nimpl<'s> Iterator for Evaluation<'s> {\n    type Item = anyhow::Result<TaskResult>;\n\n    fn next(&mut self) -> Option<anyhow::Result<TaskResult>> {\n        self.tasks.next().map(|task| self.evaluator.run(&task))\n    }\n}\n\n/// An evaluator is responsible for executing a single search for a single\n/// information need. It records the evaluation of that search result in a\n/// `TaskResult`.\n#[derive(Debug)]\nstruct Evaluator<'s> {\n    /// The evaluation specification.\n    spec: &'s Spec,\n    /// A handle to a searcher for an IMDb index.\n    searcher: Searcher,\n}\n\nimpl<'s> Evaluator<'s> {\n    /// Run this evaluator on a single information need and return the\n    /// evaluation.\n    fn run(&mut self, task: &Task) -> anyhow::Result<TaskResult> {\n        let start = Instant::now();\n        let rank = self.rank(task)?;\n        let duration = Instant::now().duration_since(start);\n        Ok(TaskResult {\n            name: self.spec.to_string(),\n            query: task.query.clone(),\n            answer: task.answer.clone(),\n            rank,\n            duration_seconds: fractional_seconds(&duration),\n        })\n    }\n\n    /// Execute the search for the given information need and determine the\n    /// rank of the expected answer for the given information need. 
If the\n    /// expected answer didn't appear in the search results, then `None` is\n    /// returned.\n    ///\n    /// The rank of the answer is determined in exactly the way you might\n    /// expect: if the answer appears as the Nth result in a search, then its\n    /// rank is N. There is one tricky part of this, and it is specifically in\n    /// how we break ties. Stated succinctly, we always take the maximum\n    /// possible rank of a result. For example, given the following results,\n    /// where the first column is the score, the second column is the\n    /// result name, and the third column is the *intuitive* rank:\n    ///\n    ///     1.0  a  1\n    ///     1.0  b  1\n    ///     1.0  c  1\n    ///     0.9  d  4\n    ///     0.8  e  5\n    ///     0.8  f  5\n    ///     0.7  g  7\n    ///\n    /// Namely, records that are tied all get assigned the same rank, and the\n    /// next result with a lower score is assigned a rank equivalent to its\n    /// absolute position in the result list.\n    ///\n    /// The problem with this ranking strategy is that it biases toward rankers\n    /// that have a naive score. In particular, so long as a search returns the\n    /// answer in the results, it could assign a score of `1.0` to every\n    /// result and get a maximal RR (Reciprocal Rank) evaluation.\n    ///\n    /// Instead, we invert how results are ranked. 
The above example is instead\n    /// ranked like so:\n    ///\n    ///     1.0  a  3\n    ///     1.0  b  3\n    ///     1.0  c  3\n    ///     0.9  d  4\n    ///     0.8  e  6\n    ///     0.8  f  6\n    ///     0.7  g  7\n    ///\n    /// In other words, we assign the maximal possible rank instead of the\n    /// minimal possible rank.\n    ///\n    /// There are other strategies, but in general, we want to reward high\n    /// precision rankers.\n    fn rank(&mut self, task: &Task) -> anyhow::Result<Option<u64>> {\n        let results = self.searcher.search(&self.spec.query(&task))?;\n\n        let mut rank = results.len() as u64;\n        let mut prev_score = None;\n        let mut ranked: Vec<(u64, MediaEntity)> = vec![];\n        for (i, scored) in results.into_iter().enumerate().rev() {\n            let (score, entity) = scored.into_pair();\n            if prev_score.map_or(true, |s| !approx_eq(s, score)) {\n                rank = i as u64 + 1;\n                prev_score = Some(score);\n            }\n            ranked.push((rank, entity));\n        }\n        ranked.reverse();\n\n        for (rank, entity) in ranked {\n            if entity.title().id == task.answer {\n                return Ok(Some(rank));\n            }\n        }\n        Ok(None)\n    }\n}\n\n/// Compares two floating point numbers for equality approximately for some\n/// epsilon.\nfn approx_eq(x1: f64, x2: f64) -> bool {\n    // We used a fixed error because it's good enough in practice.\n    (x1 - x2).abs() <= 0.0000000001\n}\n\n/// Returns the number of seconds in this duration in fraction form.\n/// The number to the left of the decimal point is the number of seconds,\n/// and the number to the right is the number of milliseconds.\nfn fractional_seconds(d: &Duration) -> f64 {\n    let fractional = (d.subsec_nanos() as f64) / 1_000_000_000.0;\n    d.as_secs() as f64 + fractional\n}\n\n#[cfg(test)]\nmod tests {\n    use imdb_index::{NameScorer, NgramType, Similarity};\n\n    use 
super::Spec;\n\n    #[test]\n    fn spec_printer() {\n        let spec = Spec {\n            result_size: 30,\n            ngram_size: 3,\n            ngram_type: NgramType::Window,\n            sim: Similarity::None,\n            scorer: Some(NameScorer::OkapiBM25),\n        };\n        let expected =\n            \"size-30_ngram-3_ngram-type-window_sim-none_scorer-okapibm25\";\n        assert_eq!(spec.to_string(), expected);\n\n        let spec = Spec {\n            result_size: 1,\n            ngram_size: 2,\n            ngram_type: NgramType::Edge,\n            sim: Similarity::Jaro,\n            scorer: None,\n        };\n        let expected = \"size-1_ngram-2_ngram-type-edge_sim-jaro_scorer-none\";\n        assert_eq!(spec.to_string(), expected);\n    }\n}\n"
  },
  {
    "path": "imdb-eval/src/logger.rs",
    "content": "// This module defines a super simple logger that works with the `log` crate.\n// We don't need anything fancy; just basic log levels and the ability to\n// print to stderr. We therefore avoid bringing in extra dependencies just\n// for this functionality.\n\nuse log::Log;\n\nuse anyhow::Result;\n\n/// Initialize a simple logger.\npub fn init() -> Result<()> {\n    Ok(Logger::init()?)\n}\n\n/// The simplest possible logger that logs to stderr.\n///\n/// This logger does no filtering. Instead, it relies on the `log` crates\n/// filtering via its global max_level setting.\n#[derive(Debug)]\nstruct Logger(());\n\nconst LOGGER: &'static Logger = &Logger(());\n\nimpl Logger {\n    /// Create a new logger that logs to stderr and initialize it as the\n    /// global logger. If there was a problem setting the logger, then an\n    /// error is returned.\n    fn init() -> std::result::Result<(), log::SetLoggerError> {\n        log::set_logger(LOGGER)\n    }\n}\n\nimpl Log for Logger {\n    fn enabled(&self, _: &log::Metadata) -> bool {\n        // We set the log level via log::set_max_level, so we don't need to\n        // implement filtering here.\n        true\n    }\n\n    fn log(&self, record: &log::Record) {\n        if !should_log(record) {\n            return;\n        }\n        eprintln!(\"{}: {}\", record.level(), record.args());\n    }\n\n    fn flush(&self) {\n        // We use eprintln! which is flushed on every call.\n    }\n}\n\nfn should_log(record: &log::Record) -> bool {\n    let t = record.target();\n    t.starts_with(\"imdb_rename\") || t.starts_with(\"imdb_index\")\n}\n"
  },
  {
    "path": "imdb-eval/src/main.rs",
    "content": "use std::env;\nuse std::io;\nuse std::path::{Path, PathBuf};\nuse std::process;\nuse std::result;\nuse std::str::FromStr;\n\nuse imdb_index::{NameScorer, NgramType, Similarity};\nuse lazy_static::lazy_static;\n\nuse crate::eval::Spec;\n\nmod eval;\nmod logger;\n\nfn main() {\n    if let Err(err) = try_main() {\n        // A pipe error occurs when the consumer of this process's output has\n        // hung up. This is a normal event, and we should quit gracefully.\n        if is_pipe_error(&err) {\n            process::exit(0);\n        }\n        eprintln!(\"{:?}\", err);\n        process::exit(1);\n    }\n}\n\nfn try_main() -> anyhow::Result<()> {\n    logger::init()?;\n    log::set_max_level(log::LevelFilter::Info);\n\n    let args = Args::from_matches(&app().get_matches())?;\n    if args.debug {\n        log::set_max_level(log::LevelFilter::Debug);\n    }\n    if let Some(ref summarize) = args.summarize {\n        return run_summarize(summarize);\n    } else if args.dry_run {\n        for spec in args.specs()? {\n            println!(\"{}\", spec);\n        }\n        return Ok(());\n    }\n    run_eval(\n        &args.data_dir,\n        &args.eval_dir,\n        args.truth.as_ref().map(|p| p.as_path()),\n        args.specs()?,\n    )\n}\n\n/// Run an evaluation on the IMDb data in `data_dir`, and store any indexes\n/// created for the evaluation in `eval_dir`. If a path to truth data is given,\n/// then the information needs or \"tasks\" used for the evaluation are taken\n/// from that file, otherwise, a built-in truth data set is used.\n///\n/// The specs given each describe the protocol for an evaluation. They each\n/// represent a configuration for how an IMDb index is built and how queries\n/// are constructed. The specification is fundamentally the thing we want to\n/// evaluate. 
That is, we want to find the \"best\" specification.\nfn run_eval(\n    data_dir: &Path,\n    eval_dir: &Path,\n    truth_path: Option<&Path>,\n    specs: Vec<Spec>,\n) -> anyhow::Result<()> {\n    if !data_dir.exists() {\n        anyhow::bail!(\n            \"data directory {} does not exist; please use \\\n             imdb-rename to create it\",\n            data_dir.display()\n        );\n    }\n\n    let mut wtr = csv::Writer::from_writer(io::stdout());\n    for spec in &specs {\n        let results = match truth_path {\n            None => spec.evaluate(data_dir, eval_dir)?,\n            Some(p) => spec.evaluate_with(data_dir, eval_dir, p)?,\n        };\n        for result in results {\n            wtr.serialize(result?)?;\n            wtr.flush()?;\n        }\n    }\n    Ok(())\n}\n\n/// Summarize the evaluation results at the given path.\nfn run_summarize(summarize: &Path) -> anyhow::Result<()> {\n    let mut results: Vec<eval::TaskResult> = vec![];\n    let mut rdr = csv::Reader::from_path(summarize)?;\n    for result in rdr.deserialize() {\n        results.push(result?);\n    }\n\n    let mut wtr = csv::Writer::from_writer(io::stdout());\n    for summary in eval::Summary::from_task_results(&results) {\n        wtr.serialize(summary)?;\n    }\n    wtr.flush()?;\n    Ok(())\n}\n\n#[derive(Debug)]\nstruct Args {\n    data_dir: PathBuf,\n    debug: bool,\n    dry_run: bool,\n    eval_dir: PathBuf,\n    ngram_sizes: Vec<usize>,\n    ngram_types: Vec<NgramType>,\n    result_sizes: Vec<usize>,\n    scorers: Vec<Option<NameScorer>>,\n    similarities: Vec<Similarity>,\n    summarize: Option<PathBuf>,\n    truth: Option<PathBuf>,\n}\n\nimpl Args {\n    /// Build a structured set of arguments from clap's matches.\n    fn from_matches(matches: &clap::ArgMatches) -> anyhow::Result<Args> {\n        let data_dir =\n            matches.value_of_os(\"data-dir\").map(PathBuf::from).unwrap();\n        let eval_dir =\n            
matches.value_of_os(\"eval-dir\").map(PathBuf::from).unwrap();\n        let similarities = parse_many_lossy(\n            matches,\n            \"sim\",\n            vec![\n                Similarity::None,\n                Similarity::Levenshtein,\n                Similarity::Jaro,\n                Similarity::JaroWinkler,\n            ],\n        )?;\n        let scorers = parse_many_lossy(\n            matches,\n            \"scorer\",\n            vec![\n                OptionalNameScorer::from(NameScorer::OkapiBM25),\n                OptionalNameScorer::from(NameScorer::TFIDF),\n                OptionalNameScorer::from(NameScorer::Jaccard),\n                OptionalNameScorer::from(NameScorer::QueryRatio),\n            ],\n        )?\n        .into_iter()\n        .map(|s| s.0)\n        .collect();\n        let ngram_types =\n            parse_many_lossy(matches, \"ngram-type\", vec![NgramType::Window])?;\n        Ok(Args {\n            data_dir,\n            debug: matches.is_present(\"debug\"),\n            dry_run: matches.is_present(\"dry-run\"),\n            eval_dir,\n            ngram_sizes: parse_many_lossy(matches, \"ngram-size\", vec![3])?,\n            ngram_types,\n            result_sizes: parse_many_lossy(matches, \"result-size\", vec![30])?,\n            scorers,\n            similarities,\n            summarize: matches.value_of_os(\"summarize\").map(PathBuf::from),\n            truth: matches.value_of_os(\"truth\").map(PathBuf::from),\n        })\n    }\n\n    /// Build all evaluation specifications as indicated by command line\n    /// options.\n    fn specs(&self) -> anyhow::Result<Vec<Spec>> {\n        // We want to build all possible permutations. We do this by\n        // alternating between specs1 and specs2. 
Each additional parameter\n        // combinatorially explodes the previous set of specifications.\n\n        let (mut specs1, mut specs2) = (vec![], vec![]);\n        for &ngram_size in &self.ngram_sizes {\n            specs1.push(Spec::new().with_ngram_size(ngram_size)?);\n        }\n        for spec in specs1.drain(..) {\n            for &result_size in &self.result_sizes {\n                specs2.push(spec.clone().with_result_size(result_size)?);\n            }\n        }\n        for spec in specs2.drain(..) {\n            for sim in &self.similarities {\n                specs1.push(spec.clone().with_similarity(sim.clone()));\n            }\n        }\n        for spec in specs1.drain(..) {\n            for scorer in &self.scorers {\n                specs2.push(spec.clone().with_scorer(scorer.clone()));\n            }\n        }\n        for spec in specs2.drain(..) {\n            for ngram_type in &self.ngram_types {\n                specs1.push(spec.clone().with_ngram_type(ngram_type.clone()));\n            }\n        }\n        Ok(specs1)\n    }\n}\n\nfn app() -> clap::App<'static, 'static> {\n    use clap::{App, AppSettings, Arg};\n\n    lazy_static! {\n        // clap wants all of its strings tied to a particular lifetime, but\n        // we'd really like to determine some default values dynamically. Using\n        // a lazy_static here is one way of safely giving a static lifetime to\n        // a value that is computed at runtime.\n        //\n        // An alternative approach would be to compute all of our default\n        // values in the caller, and pass them into this function. It's nicer\n        // to define what we need here though. 
Locality of reference and all\n        // that.\n        static ref DEFAULT_DATA_DIR: PathBuf =\n            env::temp_dir().join(\"imdb-rename\");\n        static ref DEFAULT_EVAL_DIR: PathBuf =\n            env::temp_dir().join(\"imdb-rename-eval\");\n        static ref POSSIBLE_SCORER_NAMES: Vec<&'static str> = {\n            let mut names = NameScorer::possible_names().to_vec();\n            names.insert(0, \"none\");\n            names\n        };\n    }\n\n    App::new(\"imdb-rename\")\n        .author(clap::crate_authors!())\n        .version(clap::crate_version!())\n        .max_term_width(100)\n        .setting(AppSettings::UnifiedHelpMessage)\n        .arg(Arg::with_name(\"data-dir\")\n             .long(\"data-dir\")\n             .env(\"IMDB_RENAME_DATA_DIR\")\n             .takes_value(true)\n             .default_value_os(DEFAULT_DATA_DIR.as_os_str())\n             .help(\"The location to store IMDb data files.\"))\n        .arg(Arg::with_name(\"debug\")\n             .long(\"debug\")\n             .help(\"Show debug messages. Use this when filing bugs.\"))\n        .arg(Arg::with_name(\"dry-run\")\n             .long(\"dry-run\")\n             .help(\"Show the evaluations that would be run and then exit \\\n                    without running them.\"))\n        .arg(Arg::with_name(\"eval-dir\")\n             .long(\"eval-dir\")\n             .env(\"IMDB_RENAME_EVAL_DIR\")\n             .takes_value(true)\n             .default_value_os(DEFAULT_EVAL_DIR.as_os_str())\n             .help(\"The location to store evaluation index files.\"))\n        .arg(Arg::with_name(\"ngram-size\")\n             .long(\"ngram-size\")\n             .takes_value(true)\n             .multiple(true)\n             .number_of_values(1)\n             .help(\"Set the ngram size on which to perform an evaluation. \\\n                    An evaluation will be performed for each ngram size. 
\\\n                    If no ngram size is given, a default of 3 is used.\"))\n        .arg(Arg::with_name(\"ngram-type\")\n             .long(\"ngram-type\")\n             .takes_value(true)\n             .multiple(true)\n             .number_of_values(1)\n             .possible_values(NgramType::possible_names())\n             .help(\"Set the ngram type on which to perform an evaluation. \\\n                    An evaluation will be performed for each ngram type. \\\n                    If no ngram type is given, it defaults to 'window'.\"))\n        .arg(Arg::with_name(\"result-size\")\n             .long(\"result-size\")\n             .takes_value(true)\n             .multiple(true)\n             .number_of_values(1)\n             .help(\"Set the result size on which to perform an evaluation. \\\n                    An evaluation will be performed for each result size. \\\n                    If no result size is given, a default of 30 is used.\"))\n        .arg(Arg::with_name(\"scorer\")\n             .long(\"scorer\")\n             .takes_value(true)\n             .multiple(true)\n             .number_of_values(1)\n             .possible_values(&POSSIBLE_SCORER_NAMES)\n             .help(\"Set the name scorer function to use. An evaluation is \\\n                    performed for each name function given. By default, \\\n                    all name scorers are used, except for 'none'.\"))\n        .arg(Arg::with_name(\"sim\")\n             .long(\"sim\")\n             .takes_value(true)\n             .multiple(true)\n             .number_of_values(1)\n             .possible_values(Similarity::possible_names())\n             .help(\"Set the similarity ranker function to use. An evaluation \\\n                    is performed for each ranker function given. 
By default, \\\n                    all ranker functions are used, including 'none'.\"))\n        .arg(Arg::with_name(\"summarize\")\n             .long(\"summarize\")\n             .takes_value(true)\n             .number_of_values(1)\n             .help(\"Print summary statistics from an evaluation run.\"))\n        .arg(Arg::with_name(\"truth\")\n             .long(\"truth\")\n             .takes_value(true)\n             .help(\"A file path containing evaluation truth data. By default, \\\n                    an evaluation uses truth data embedded in imdb-rename.\"))\n}\n\n/// An optional name scorer is a `NameScorer` that may be absent.\n///\n/// We define a type for it to make parsing it easier.\n#[derive(Debug)]\nstruct OptionalNameScorer(Option<NameScorer>);\n\nimpl FromStr for OptionalNameScorer {\n    type Err = imdb_index::Error;\n\n    fn from_str(\n        s: &str,\n    ) -> result::Result<OptionalNameScorer, imdb_index::Error> {\n        let opt = if s == \"none\" { None } else { Some(s.parse()?) 
};\n        Ok(OptionalNameScorer(opt))\n    }\n}\n\nimpl From<NameScorer> for OptionalNameScorer {\n    fn from(scorer: NameScorer) -> OptionalNameScorer {\n        OptionalNameScorer(Some(scorer))\n    }\n}\n\n/// Parse a sequence of values from clap.\nfn parse_many_lossy<\n    E: std::error::Error + Send + Sync + 'static,\n    T: FromStr<Err = E>,\n>(\n    matches: &clap::ArgMatches,\n    name: &str,\n    default: Vec<T>,\n) -> anyhow::Result<Vec<T>> {\n    let strs = match matches.values_of_lossy(name) {\n        None => return Ok(default),\n        Some(strs) => strs,\n    };\n    let mut values = vec![];\n    for s in strs {\n        values.push(s.parse()?);\n    }\n    Ok(values)\n}\n\n/// Return true if and only if an I/O broken pipe error exists in the causal\n/// chain of the given error.\nfn is_pipe_error(err: &anyhow::Error) -> bool {\n    for cause in err.chain() {\n        if let Some(ioerr) = cause.downcast_ref::<io::Error>() {\n            if ioerr.kind() == io::ErrorKind::BrokenPipe {\n                return true;\n            }\n        }\n    }\n    false\n}\n"
  },
  {
    "path": "imdb-index/COPYING",
    "content": "This project is dual-licensed under the Unlicense and MIT licenses.\n\nYou may use this code under the terms of either license.\n"
  },
  {
    "path": "imdb-index/Cargo.toml",
    "content": "[package]\nname = \"imdb-index\"\nversion = \"0.1.4\"  #:version\nauthors = [\"Andrew Gallant <jamslam@gmail.com>\"]\ndescription = \"\"\"\nA library for indexing and searching IMDb using information retrieval.\n\"\"\"\ndocumentation = \"https://github.com/BurntSushi/imdb-rename\"\nhomepage = \"https://github.com/BurntSushi/imdb-rename\"\nrepository = \"https://github.com/BurntSushi/imdb-rename\"\nreadme = \"README.md\"\nkeywords = [\"imdb\", \"movie\", \"index\", \"search\"]\nlicense = \"Unlicense/MIT\"\nedition = \"2021\"\n\n[dependencies]\ncsv = \"1.3.0\"\nfnv = \"1.0.7\"\nfst = \"0.4.7\"\nlazy_static = \"1.4.0\"\nlog = { version = \"0.4.20\", features = [\"std\"] }\nmemmap = { package = \"memmap2\", version = \"0.9.1\" }\nregex = \"1.10.2\"\nserde = { version = \"1.0.193\", features = [\"derive\"] }\nserde_json = \"1.0.108\"\nstrsim = \"0.10.0\"\n"
  },
  {
    "path": "imdb-index/LICENSE-MIT",
    "content": "The MIT License (MIT)\n\nCopyright (c) 2015 Andrew Gallant\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in\nall copies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\nTHE SOFTWARE.\n"
  },
  {
    "path": "imdb-index/README.md",
    "content": "imdb-index\n==========\nA library for reading and writing an IMDb index, with a focus on IMDb titles.\nIn particular, this library can build a name index on all of IMDb's 6 million\nnames, which supports fast fuzzy searching and relevance ranking.\n\n[![Linux build status](https://api.travis-ci.org/BurntSushi/imdb-rename.png)](https://travis-ci.org/BurntSushi/imdb-rename)\n[![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/imdb-rename?svg=true)](https://ci.appveyor.com/project/BurntSushi/imdb-rename)\n[![](http://meritbadge.herokuapp.com/imdb-rename)](https://crates.io/crates/imdb-index)\n\nDual-licensed under MIT or the [UNLICENSE](http://unlicense.org).\n\n\n### Documentation\n\nhttps://docs.rs/imdb-index\n"
  },
  {
    "path": "imdb-index/UNLICENSE",
    "content": "This is free and unencumbered software released into the public domain.\n\nAnyone is free to copy, modify, publish, use, compile, sell, or\ndistribute this software, either in source code form or as a compiled\nbinary, for any purpose, commercial or non-commercial, and by any\nmeans.\n\nIn jurisdictions that recognize copyright laws, the author or authors\nof this software dedicate any and all copyright interest in the\nsoftware to the public domain. We make this dedication for the benefit\nof the public at large and to the detriment of our heirs and\nsuccessors. We intend this dedication to be an overt act of\nrelinquishment in perpetuity of all present and future rights to this\nsoftware under copyright law.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND,\nEXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\nMERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\nIN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR\nOTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,\nARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR\nOTHER DEALINGS IN THE SOFTWARE.\n\nFor more information, please refer to <http://unlicense.org/>\n"
  },
  {
    "path": "imdb-index/src/error.rs",
    "content": "use std::fmt;\nuse std::path::{Path, PathBuf};\n\n/// A type alias for handling errors throughout imdb-index.\npub type Result<T> = std::result::Result<T, Error>;\n\n/// An error that can occur while interacting with an IMDb index.\n#[derive(Debug)]\npub struct Error {\n    kind: ErrorKind,\n}\n\nimpl Error {\n    /// Return a reference to the kind of this error.\n    pub fn kind(&self) -> &ErrorKind {\n        &self.kind\n    }\n\n    /// Transfer ownership of the kind of this error.\n    pub fn into_kind(self) -> ErrorKind {\n        self.kind\n    }\n\n    pub(crate) fn new(kind: ErrorKind) -> Error {\n        Error { kind }\n    }\n\n    pub(crate) fn unknown_title<T: AsRef<str>>(unk: T) -> Error {\n        Error { kind: ErrorKind::UnknownTitle(unk.as_ref().to_string()) }\n    }\n\n    pub(crate) fn unknown_scorer<T: AsRef<str>>(unk: T) -> Error {\n        Error { kind: ErrorKind::UnknownScorer(unk.as_ref().to_string()) }\n    }\n\n    pub(crate) fn unknown_ngram_type<T: AsRef<str>>(unk: T) -> Error {\n        Error { kind: ErrorKind::UnknownNgramType(unk.as_ref().to_string()) }\n    }\n\n    pub(crate) fn unknown_sim<T: AsRef<str>>(unk: T) -> Error {\n        Error { kind: ErrorKind::UnknownSimilarity(unk.as_ref().to_string()) }\n    }\n\n    pub(crate) fn unknown_directive<T: AsRef<str>>(unk: T) -> Error {\n        Error { kind: ErrorKind::UnknownDirective(unk.as_ref().to_string()) }\n    }\n\n    pub(crate) fn bug<T: AsRef<str>>(msg: T) -> Error {\n        Error { kind: ErrorKind::Bug(msg.as_ref().to_string()) }\n    }\n\n    pub(crate) fn config<T: AsRef<str>>(msg: T) -> Error {\n        Error { kind: ErrorKind::Config(msg.as_ref().to_string()) }\n    }\n\n    pub(crate) fn version(expected: u64, got: u64) -> Error {\n        Error { kind: ErrorKind::VersionMismatch { expected, got } }\n    }\n\n    pub(crate) fn csv(err: csv::Error) -> Error {\n        Error { kind: ErrorKind::Csv(err.to_string()) }\n    }\n\n    pub(crate) fn fst(err: 
fst::Error) -> Error {\n        Error { kind: ErrorKind::Fst(err.to_string()) }\n    }\n\n    pub(crate) fn io(err: std::io::Error) -> Error {\n        Error { kind: ErrorKind::Io { err, path: None } }\n    }\n\n    pub(crate) fn io_path<P: AsRef<Path>>(\n        err: std::io::Error,\n        path: P,\n    ) -> Error {\n        Error {\n            kind: ErrorKind::Io {\n                err,\n                path: Some(path.as_ref().to_path_buf()),\n            },\n        }\n    }\n\n    pub(crate) fn number<E: std::error::Error + Send + Sync + 'static>(\n        err: E,\n    ) -> Error {\n        Error { kind: ErrorKind::Number(Box::new(err)) }\n    }\n}\n\nimpl std::error::Error for Error {\n    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {\n        match self.kind {\n            ErrorKind::Io { ref err, .. } => Some(err),\n            ErrorKind::Number(ref err) => Some(&**err),\n            _ => None,\n        }\n    }\n}\n\nimpl fmt::Display for Error {\n    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {\n        self.kind.fmt(f)\n    }\n}\n\n/// The specific kind of error that can occur.\n#[derive(Debug)]\npub enum ErrorKind {\n    /// An index version mismatch. 
This error occurs when the version of the\n    /// index is different from the version supported by this version of\n    /// imdb-index.\n    ///\n    /// Generally speaking, the versions must be exactly equivalent, otherwise\n    /// this error is returned.\n    VersionMismatch {\n        /// The expected or supported index version.\n        expected: u64,\n        /// The actual version of the index on disk.\n        got: u64,\n    },\n    /// An error parsing the type of a title.\n    ///\n    /// The data provided is the unrecognized title type.\n    UnknownTitle(String),\n    /// An error parsing the name of a scorer.\n    ///\n    /// The data provided is the unrecognized name.\n    UnknownScorer(String),\n    /// An error parsing the name of an ngram type.\n    ///\n    /// The data provided is the unrecognized name.\n    UnknownNgramType(String),\n    /// An error parsing the name of a similarity function.\n    ///\n    /// The data provided is the unrecognized name.\n    UnknownSimilarity(String),\n    /// An error parsing the name of a directive from a free-form query.\n    ///\n    /// The data provided is the unrecognized name.\n    UnknownDirective(String),\n    /// An unexpected error occurred while reading an index that should not\n    /// have occurred. 
Generally, these errors correspond to bugs in this\n    /// library.\n    Bug(String),\n    /// An error occurred while reading/writing the index config.\n    Config(String),\n    /// An error that occured while writing or reading CSV data.\n    Csv(String),\n    /// An error that occured while creating an FST index.\n    Fst(String),\n    /// An unexpected I/O error occurred.\n    Io {\n        /// The underlying I/O error.\n        err: std::io::Error,\n        /// A file path, if the I/O error occurred in the context of a named\n        /// file.\n        path: Option<PathBuf>,\n    },\n    /// An error occurred while parsing a number in a free-form query.\n    Number(Box<dyn std::error::Error + Send + Sync>),\n    /// Hints that destructuring should not be exhaustive.\n    ///\n    /// This enum may grow additional variants, so this makes sure clients\n    /// don't count on exhaustive matching. (Otherwise, adding a new variant\n    /// could break existing code.)\n    #[doc(hidden)]\n    __Nonexhaustive,\n}\n\nimpl fmt::Display for ErrorKind {\n    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {\n        match *self {\n            ErrorKind::VersionMismatch { expected, got } => write!(\n                f,\n                \"index version mismatch: expected version {} \\\n                           but got version {}. 
Please rebuild the index.\",\n                expected, got\n            ),\n            ErrorKind::UnknownTitle(ref unk) => {\n                write!(f, \"unrecognized title type: '{}'\", unk)\n            }\n            ErrorKind::UnknownScorer(ref unk) => {\n                write!(f, \"unrecognized scorer name: '{}'\", unk)\n            }\n            ErrorKind::UnknownNgramType(ref unk) => {\n                write!(f, \"unrecognized ngram type: '{}'\", unk)\n            }\n            ErrorKind::UnknownSimilarity(ref unk) => {\n                write!(f, \"unrecognized similarity function: '{}'\", unk)\n            }\n            ErrorKind::UnknownDirective(ref unk) => {\n                write!(f, \"unrecognized search directive: '{}'\", unk)\n            }\n            ErrorKind::Bug(ref msg) => {\n                let report = \"Please report this bug with a backtrace at \\\n                              https://github.com/BurntSushi/imdb-rename\";\n                write!(f, \"BUG: {}\\n{}\", msg, report)\n            }\n            ErrorKind::Config(ref msg) => write!(f, \"config error: {}\", msg),\n            ErrorKind::Csv(ref msg) => write!(f, \"{}\", msg),\n            ErrorKind::Fst(ref msg) => write!(f, \"fst error: {}\", msg),\n            ErrorKind::Io { path: None, .. } => write!(f, \"I/O error\"),\n            ErrorKind::Io { path: Some(ref p), .. } => {\n                write!(f, \"{}\", p.display())\n            }\n            ErrorKind::Number(_) => write!(f, \"error parsing number\"),\n            ErrorKind::__Nonexhaustive => panic!(\"invalid error\"),\n        }\n    }\n}\n"
  },
  {
    "path": "imdb-index/src/index/aka.rs",
    "content": "use std::io;\nuse std::iter;\nuse std::path::Path;\n\nuse memmap::Mmap;\n\nuse crate::error::{Error, Result};\nuse crate::index::{csv_file, csv_mmap, id};\nuse crate::record::AKA;\nuse crate::util::IMDB_AKAS;\n\n/// A name of the AKA record index file.\n///\n/// This index represents a map from IMDb title id to a 64-bit integer. The\n/// 64-bit integer encodes two pieces of information: the number of alternate\n/// names for the title (high 16 bits) and the file offset at which the records\n/// appear in title.akas.tsv (low 48 bits).\nconst AKAS: &str = \"akas.fst\";\n\n/// A handle to the AKA name index.\n///\n/// The AKA index maps IMDb identifiers to a list of AKA records.\n///\n/// This index assumes that the underlying AKA CSV file is sorted by IMDb ID.\n#[derive(Debug)]\npub struct Index {\n    akas: csv::Reader<io::Cursor<Mmap>>,\n    idx: id::IndexReader,\n}\n\nimpl Index {\n    /// Open an AKA index using the corresponding data and index directories.\n    /// The data directory contains the IMDb data set while the index directory\n    /// contains the index data files.\n    pub fn open<P1: AsRef<Path>, P2: AsRef<Path>>(\n        data_dir: P1,\n        index_dir: P2,\n    ) -> Result<Index> {\n        Ok(Index {\n            // We claim it is safe to open the following memory map because we\n            // don't mutate them and no other process (should) either.\n            akas: unsafe { csv_mmap(data_dir.as_ref().join(IMDB_AKAS))? 
},\n            idx: id::IndexReader::from_path(index_dir.as_ref().join(AKAS))?,\n        })\n    }\n\n    /// Create an AKA index by reading the AKA data from the given data\n    /// directory and writing the index to the corresponding index directory.\n    pub fn create<P1: AsRef<Path>, P2: AsRef<Path>>(\n        data_dir: P1,\n        index_dir: P2,\n    ) -> Result<Index> {\n        let data_dir = data_dir.as_ref();\n        let index_dir = index_dir.as_ref();\n\n        let rdr = csv_file(data_dir.join(IMDB_AKAS))?;\n        let mut wtr = id::IndexSortedWriter::from_path(index_dir.join(AKAS))?;\n        let mut count = 0u64;\n        for result in AKAIndexRecords::new(rdr) {\n            let record = result?;\n            wtr.insert(&record.id, (record.count << 48) | record.offset)?;\n            count += record.count;\n        }\n        wtr.finish()?;\n\n        log::info!(\"{} alternate names indexed\", count);\n        Index::open(data_dir, index_dir)\n    }\n\n    /// Return a (possibly empty) iterator over all AKA records for the given\n    /// IMDb ID.\n    pub fn find(&mut self, id: &[u8]) -> Result<AKARecordIter> {\n        match self.idx.get(id) {\n            None => Ok(AKARecordIter(None)),\n            Some(v) => {\n                let count = (v >> 48) as usize;\n                let offset = v & ((1 << 48) - 1);\n\n                let mut pos = csv::Position::new();\n                pos.set_byte(offset);\n                self.akas.seek(pos).map_err(Error::csv)?;\n\n                Ok(AKARecordIter(Some(self.akas.deserialize().take(count))))\n            }\n        }\n    }\n}\n\n/// An iterator over AKA records for a single IMDb title.\n///\n/// This iterator is constructed via the `aka::Index::find` method.\n///\n/// This iterator may yield no titles.\n///\n/// The lifetime `'r` refers to the lifetime of the underlying AKA index\n/// reader.\npub struct AKARecordIter<'r>(\n    Option<iter::Take<csv::DeserializeRecordsIter<'r, io::Cursor<Mmap>, 
AKA>>>,\n);\n\nimpl<'r> Iterator for AKARecordIter<'r> {\n    type Item = Result<AKA>;\n\n    fn next(&mut self) -> Option<Result<AKA>> {\n        let next = match self.0.as_mut().and_then(|it| it.next()) {\n            None => return None,\n            Some(next) => next,\n        };\n        match next {\n            Ok(next) => Some(Ok(next)),\n            Err(err) => Some(Err(Error::csv(err))),\n        }\n    }\n}\n\n/// An indexable AKA record.\n///\n/// Each indexable record represents a group of alternative titles in the\n/// title.akas.tsv file.\n#[derive(Clone, Debug, Eq, PartialEq)]\nstruct AKAIndexRecord {\n    id: Vec<u8>,\n    offset: u64,\n    count: u64,\n}\n\n/// A streaming iterator over indexable AKA records.\n///\n/// Each indexable record is a triple, and consists of an IMDb title ID,\n/// the number of alternate titles for that title, and the file offset in the\n/// CSV file at which those records begin.\n///\n/// The `R` type parameter refers to the underlying `io::Read` type of the\n/// CSV reader.\n#[derive(Debug)]\nstruct AKAIndexRecords<R> {\n    /// The underlying CSV reader.\n    rdr: csv::Reader<R>,\n    /// Scratch space for storing the byte record.\n    record: csv::ByteRecord,\n    /// Set to true when the iterator has been exhausted.\n    done: bool,\n}\n\nimpl<R: io::Read> AKAIndexRecords<R> {\n    /// Create a new streaming iterator over indexable AKA records.\n    fn new(rdr: csv::Reader<R>) -> AKAIndexRecords<R> {\n        AKAIndexRecords { rdr, record: csv::ByteRecord::new(), done: false }\n    }\n}\n\nimpl<R: io::Read> Iterator for AKAIndexRecords<R> {\n    type Item = Result<AKAIndexRecord>;\n\n    /// Advance to the next indexable record and return it. If no more\n    /// records exist, return `None`.\n    ///\n    /// If there was a problem parsing or reading from the underlying CSV\n    /// data, then an error is returned.\n    fn next(&mut self) -> Option<Result<AKAIndexRecord>> {\n        macro_rules! 
itry {\n            ($e:expr) => {\n                match $e {\n                    Err(err) => return Some(Err(Error::csv(err))),\n                    Ok(v) => v,\n                }\n            };\n        }\n\n        if self.done {\n            return None;\n        }\n        // Only initialize the record if this is our first go at it.\n        // Otherwise, previous call leaves next record in `AKAIndexRecord`.\n        if self.record.is_empty() {\n            if !itry!(self.rdr.read_byte_record(&mut self.record)) {\n                return None;\n            }\n        }\n        let mut irecord = AKAIndexRecord {\n            id: self.record[0].to_vec(),\n            offset: self.record.position().expect(\"position on row\").byte(),\n            count: 1,\n        };\n        while itry!(self.rdr.read_byte_record(&mut self.record)) {\n            if irecord.id != &self.record[0] {\n                break;\n            }\n            irecord.count += 1;\n        }\n        // If we've read the last record then we're done!\n        if self.rdr.is_done() {\n            self.done = true;\n        }\n        Some(Ok(irecord))\n    }\n}\n\n#[cfg(test)]\nmod tests {\n    use super::*;\n    use crate::util::csv_reader_builder;\n\n    #[test]\n    fn aka_index_records1() {\n        let data = r\"titleId\tordering\ttitle\tregion\tlanguage\ttypes\tattributes\tisOriginalTitle\ntt0117019\t1\tHommes à l'huile\tFR\t\\N\t\\N\t\\N\t0\ntt0117019\t2\tMänner in Öl\tDE\t\\N\t\\N\t\\N\t0\ntt0117019\t3\tMen in Oil\tXEU\ten\tfestival\t\\N\t0\ntt0117019\t4\tMänner in Öl: Annäherungsversuche an die Malerin Susanne Hay\t\\N\t\\N\toriginal\t\\N\t1\ntt0117019\t5\tMen in Oil\tXWW\ten\t\\N\t\\N\t0\ntt0117020\t1\tMendigos sin fronteras\tES\t\\N\t\\N\t\\N\t0\ntt0117021\t1\tMenno's Mind\tUS\t\\N\t\\N\t\\N\t0\ntt0117021\t2\tMenno's Mind\t\\N\t\\N\toriginal\t\\N\t1\ntt0117021\t3\tThe Matrix 2\tRU\t\\N\tvideo\t\\N\t0\ntt0117021\t4\tVirtuális 
elme\tHU\t\\N\timdbDisplay\t\\N\t0\ntt0117021\t5\tPower.com\tUS\t\\N\tvideo\t\\N\t0\ntt0117021\t6\tLa mente de Menno\tES\t\\N\t\\N\t\\N\t0\ntt0117021\t7\tPower.com\tCA\ten\tvideo\t\\N\t0\ntt0117021\t8\tTerror im Computer\tDE\t\\N\t\\N\t\\N\t0\ntt0117022\t1\tMenopause Song\tCA\t\\N\t\\N\t\\N\t0\ntt0117023\t1\tLes menteurs\tFR\t\\N\t\\N\t\\N\t0\";\n        let rdr = csv_reader_builder().from_reader(data.as_bytes());\n        let records: Vec<AKAIndexRecord> =\n            AKAIndexRecords::new(rdr).collect::<Result<_>>().unwrap();\n        assert_eq!(records.len(), 5);\n\n        assert_eq!(records[0].id, b\"tt0117019\");\n        assert_eq!(records[0].count, 5);\n\n        assert_eq!(records[1].id, b\"tt0117020\");\n        assert_eq!(records[1].count, 1);\n\n        assert_eq!(records[2].id, b\"tt0117021\");\n        assert_eq!(records[2].count, 8);\n\n        assert_eq!(records[3].id, b\"tt0117022\");\n        assert_eq!(records[3].count, 1);\n\n        assert_eq!(records[4].id, b\"tt0117023\");\n        assert_eq!(records[4].count, 1);\n    }\n\n    #[test]\n    fn aka_index_records2() {\n        let data = r\"titleId\tordering\ttitle\tregion\tlanguage\ttypes\tattributes\tisOriginalTitle\ntt0117019\t1\tHommes à l'huile\tFR\t\\N\t\\N\t\\N\t0\ntt0117019\t2\tMänner in Öl\tDE\t\\N\t\\N\t\\N\t0\ntt0117019\t3\tMen in Oil\tXEU\ten\tfestival\t\\N\t0\ntt0117019\t4\tMänner in Öl: Annäherungsversuche an die Malerin Susanne Hay\t\\N\t\\N\toriginal\t\\N\t1\ntt0117019\t5\tMen in Oil\tXWW\ten\t\\N\t\\N\t0\ntt0117020\t1\tMendigos sin fronteras\tES\t\\N\t\\N\t\\N\t0\ntt0117021\t1\tMenno's Mind\tUS\t\\N\t\\N\t\\N\t0\ntt0117021\t2\tMenno's Mind\t\\N\t\\N\toriginal\t\\N\t1\ntt0117021\t3\tThe Matrix 2\tRU\t\\N\tvideo\t\\N\t0\ntt0117021\t4\tVirtuális elme\tHU\t\\N\timdbDisplay\t\\N\t0\ntt0117021\t5\tPower.com\tUS\t\\N\tvideo\t\\N\t0\ntt0117021\t6\tLa mente de Menno\tES\t\\N\t\\N\t\\N\t0\ntt0117021\t7\tPower.com\tCA\ten\tvideo\t\\N\t0\ntt0117021\t8\tTerror im 
Computer\tDE\t\\N\t\\N\t\\N\t0\";\n        let rdr = csv_reader_builder().from_reader(data.as_bytes());\n        let records: Vec<AKAIndexRecord> =\n            AKAIndexRecords::new(rdr).collect::<Result<_>>().unwrap();\n        assert_eq!(records.len(), 3);\n\n        assert_eq!(records[0].id, b\"tt0117019\");\n        assert_eq!(records[0].count, 5);\n\n        assert_eq!(records[1].id, b\"tt0117020\");\n        assert_eq!(records[1].count, 1);\n\n        assert_eq!(records[2].id, b\"tt0117021\");\n        assert_eq!(records[2].count, 8);\n    }\n\n    #[test]\n    fn aka_index_records3() {\n        let data = r\"titleId\tordering\ttitle\tregion\tlanguage\ttypes\tattributes\tisOriginalTitle\ntt0117021\t1\tMenno's Mind\tUS\t\\N\t\\N\t\\N\t0\ntt0117021\t2\tMenno's Mind\t\\N\t\\N\toriginal\t\\N\t1\ntt0117021\t3\tThe Matrix 2\tRU\t\\N\tvideo\t\\N\t0\ntt0117021\t4\tVirtuális elme\tHU\t\\N\timdbDisplay\t\\N\t0\ntt0117021\t5\tPower.com\tUS\t\\N\tvideo\t\\N\t0\ntt0117021\t6\tLa mente de Menno\tES\t\\N\t\\N\t\\N\t0\ntt0117021\t7\tPower.com\tCA\ten\tvideo\t\\N\t0\ntt0117021\t8\tTerror im Computer\tDE\t\\N\t\\N\t\\N\t0\";\n        let rdr = csv_reader_builder().from_reader(data.as_bytes());\n        let records: Vec<AKAIndexRecord> =\n            AKAIndexRecords::new(rdr).collect::<Result<_>>().unwrap();\n        assert_eq!(records.len(), 1);\n\n        assert_eq!(records[0].id, b\"tt0117021\");\n        assert_eq!(records[0].count, 8);\n    }\n\n    #[test]\n    fn aka_index_records4() {\n        let data = r\"titleId\tordering\ttitle\tregion\tlanguage\ttypes\tattributes\tisOriginalTitle\ntt0117021\t1\tMenno's Mind\tUS\t\\N\t\\N\t\\N\t0\";\n        let rdr = csv_reader_builder().from_reader(data.as_bytes());\n        let records: Vec<AKAIndexRecord> =\n            AKAIndexRecords::new(rdr).collect::<Result<_>>().unwrap();\n        assert_eq!(records.len(), 1);\n\n        assert_eq!(records[0].id, b\"tt0117021\");\n        assert_eq!(records[0].count, 1);\n    }\n}\n"
  },
  {
    "path": "imdb-index/src/index/episode.rs",
    "content": "use std::cmp;\nuse std::path::Path;\nuse std::u32;\n\nuse fst::{IntoStreamer, Streamer};\nuse memmap::Mmap;\n\nuse crate::error::{Error, Result};\nuse crate::index::csv_file;\nuse crate::record::Episode;\nuse crate::util::{fst_set_builder_file, fst_set_file, IMDB_EPISODE};\n\n/// The name of the episode index file.\n///\n/// The episode index maps TV show ids to episodes. The index is constructed\n/// in a way where either of the following things can be used as look up keys:\n///\n///   tvshow IMDb title ID\n///   (tvshow IMDb title ID, season number)\n///\n/// In particular, the index itself stores the entire episode record, and it\n/// can be re-constituted without re-visiting the original episode data file.\nconst SEASONS: &str = \"episode.seasons.fst\";\n\n/// The name of the TV show index file.\n///\n/// The TV show index maps episode IMDb title IDs to tvshow IMDb title IDs.\n/// This allows us to quickly look up the TV show corresponding to an episode\n/// in search results.\n///\n/// The format of this index is an FST set, where each key corresponds to the\n/// episode ID joined with the TV show ID by a `NUL` byte. This lets us do\n/// a range query on the set when given the episode ID to find the TV show ID.\nconst TVSHOWS: &str = \"episode.tvshows.fst\";\n\n/// An episode index that supports retrieving season and episode information\n/// quickly.\n#[derive(Debug)]\npub struct Index {\n    seasons: fst::Set<Mmap>,\n    tvshows: fst::Set<Mmap>,\n}\n\nimpl Index {\n    /// Open an episode index from the given index directory.\n    pub fn open<P: AsRef<Path>>(index_dir: P) -> Result<Index> {\n        let index_dir = index_dir.as_ref();\n        // We claim it is safe to open the following memory map because we\n        // don't mutate them and no other process (should) either.\n        let seasons = unsafe { fst_set_file(index_dir.join(SEASONS))? };\n        let tvshows = unsafe { fst_set_file(index_dir.join(TVSHOWS))? 
};\n        Ok(Index { seasons, tvshows })\n    }\n\n    /// Create an episode index from the given IMDb data directory and write\n    /// it to the given index directory. If an episode index already exists,\n    /// then it is overwritten.\n    pub fn create<P1: AsRef<Path>, P2: AsRef<Path>>(\n        data_dir: P1,\n        index_dir: P2,\n    ) -> Result<Index> {\n        let data_dir = data_dir.as_ref();\n        let index_dir = index_dir.as_ref();\n\n        let mut buf = vec![];\n        let mut seasons = fst_set_builder_file(index_dir.join(SEASONS))?;\n        let mut tvshows = fst_set_builder_file(index_dir.join(TVSHOWS))?;\n\n        let mut episodes = read_sorted_episodes(data_dir)?;\n        for episode in &episodes {\n            buf.clear();\n            write_episode(episode, &mut buf)?;\n            seasons.insert(&buf).map_err(Error::fst)?;\n        }\n\n        episodes.sort_by(|e1, e2| {\n            (&e1.id, &e1.tvshow_id).cmp(&(&e2.id, &e2.tvshow_id))\n        });\n        for episode in &episodes {\n            buf.clear();\n            write_tvshow(&episode, &mut buf)?;\n            tvshows.insert(&buf).map_err(Error::fst)?;\n        }\n\n        seasons.finish().map_err(Error::fst)?;\n        tvshows.finish().map_err(Error::fst)?;\n\n        log::info!(\"{} episodes indexed\", episodes.len());\n        Index::open(index_dir)\n    }\n\n    /// Return a sequence of episodes for the given TV show IMDb identifier.\n    ///\n    /// The episodes are sorted in order of season number and episode number.\n    /// Episodes without season/episode numbers are sorted after episodes with\n    /// numbers.\n    pub fn seasons(&self, tvshow_id: &[u8]) -> Result<Vec<Episode>> {\n        let mut upper = tvshow_id.to_vec();\n        upper.push(0xFF);\n\n        let mut episodes = vec![];\n        let mut stream =\n            self.seasons.range().ge(tvshow_id).le(upper).into_stream();\n        while let Some(episode_bytes) = stream.next() {\n            
episodes.push(read_episode(episode_bytes)?);\n        }\n        Ok(episodes)\n    }\n\n    /// Return a sequence of episodes for the given TV show IMDb identifier and\n    /// season number.\n    ///\n    /// The episodes are sorted in order of episode number. Episodes without\n    /// episode numbers are sorted after episodes with numbers.\n    pub fn episodes(\n        &self,\n        tvshow_id: &[u8],\n        season: u32,\n    ) -> Result<Vec<Episode>> {\n        let mut lower = tvshow_id.to_vec();\n        lower.push(0x00);\n        lower.extend_from_slice(&season.to_be_bytes());\n        lower.extend_from_slice(&0u32.to_be_bytes());\n\n        let mut upper = tvshow_id.to_vec();\n        upper.push(0x00);\n        upper.extend_from_slice(&season.to_be_bytes());\n        upper.extend_from_slice(&u32::MAX.to_be_bytes());\n\n        let mut episodes = vec![];\n        let mut stream =\n            self.seasons.range().ge(lower).le(upper).into_stream();\n        while let Some(episode_bytes) = stream.next() {\n            episodes.push(read_episode(episode_bytes)?);\n        }\n        Ok(episodes)\n    }\n\n    /// Return the episode information for the given episode IMDb identifier.\n    ///\n    /// If no episode information for the given ID exists, then `None` is\n    /// returned.\n    pub fn episode(&self, episode_id: &[u8]) -> Result<Option<Episode>> {\n        let mut upper = episode_id.to_vec();\n        upper.push(0xFF);\n\n        let mut stream =\n            self.tvshows.range().ge(episode_id).le(upper).into_stream();\n        while let Some(tvshow_bytes) = stream.next() {\n            return Ok(Some(read_tvshow(tvshow_bytes)?));\n        }\n        Ok(None)\n    }\n}\n\nfn read_sorted_episodes(data_dir: &Path) -> Result<Vec<Episode>> {\n    // We claim it is safe to open the following memory map because we don't\n    // mutate them and no other process (should) either.\n    let mut rdr = csv_file(data_dir.join(IMDB_EPISODE))?;\n    let mut records 
= vec![];\n    for result in rdr.deserialize() {\n        let record: Episode = result.map_err(Error::csv)?;\n        records.push(record);\n    }\n    records.sort_by(cmp_episode);\n    Ok(records)\n}\n\nfn cmp_episode(ep1: &Episode, ep2: &Episode) -> cmp::Ordering {\n    let k1 = (\n        &ep1.tvshow_id,\n        ep1.season.unwrap_or(u32::MAX),\n        ep1.episode.unwrap_or(u32::MAX),\n        &ep1.id,\n    );\n    let k2 = (\n        &ep2.tvshow_id,\n        ep2.season.unwrap_or(u32::MAX),\n        ep2.episode.unwrap_or(u32::MAX),\n        &ep2.id,\n    );\n    k1.cmp(&k2)\n}\n\nfn read_episode(bytes: &[u8]) -> Result<Episode> {\n    let nul = match bytes.iter().position(|&b| b == 0) {\n        Some(nul) => nul,\n        None => bug!(\"could not find nul byte\"),\n    };\n    let tvshow_id = match String::from_utf8(bytes[..nul].to_vec()) {\n        Err(err) => bug!(\"tvshow_id invalid UTF-8: {}\", err),\n        Ok(tvshow_id) => tvshow_id,\n    };\n\n    let mut i = nul + 1;\n    let season = from_optional_u32(\"season\", &bytes[i..])?;\n\n    i += 4;\n    let episode = from_optional_u32(\"episode number\", &bytes[i..])?;\n\n    i += 4;\n    let id = match String::from_utf8(bytes[i..].to_vec()) {\n        Err(err) => bug!(\"episode id invalid UTF-8: {}\", err),\n        Ok(id) => id,\n    };\n    Ok(Episode { id, tvshow_id, season, episode })\n}\n\nfn write_episode(ep: &Episode, buf: &mut Vec<u8>) -> Result<()> {\n    if ep.tvshow_id.as_bytes().iter().any(|&b| b == 0) {\n        bug!(\"unsupported tvshow id (with NUL byte) for {:?}\", ep);\n    }\n    buf.extend_from_slice(ep.tvshow_id.as_bytes());\n    buf.push(0x00);\n    buf.extend_from_slice(&to_optional_season(ep)?.to_be_bytes());\n    buf.extend_from_slice(&to_optional_epnum(ep)?.to_be_bytes());\n    buf.extend_from_slice(ep.id.as_bytes());\n    Ok(())\n}\n\nfn read_tvshow(bytes: &[u8]) -> Result<Episode> {\n    let nul = match bytes.iter().position(|&b| b == 0) {\n        Some(nul) => nul,\n        
None => bug!(\"could not find nul byte\"),\n    };\n    let id = match String::from_utf8(bytes[..nul].to_vec()) {\n        Err(err) => bug!(\"episode id invalid UTF-8: {}\", err),\n        Ok(tvshow_id) => tvshow_id,\n    };\n\n    let mut i = nul + 1;\n    let season = from_optional_u32(\"season\", &bytes[i..])?;\n\n    i += 4;\n    let episode = from_optional_u32(\"episode number\", &bytes[i..])?;\n\n    i += 4;\n    let tvshow_id = match String::from_utf8(bytes[i..].to_vec()) {\n        Err(err) => bug!(\"tvshow_id invalid UTF-8: {}\", err),\n        Ok(tvshow_id) => tvshow_id,\n    };\n    Ok(Episode { id, tvshow_id, season, episode })\n}\n\nfn write_tvshow(ep: &Episode, buf: &mut Vec<u8>) -> Result<()> {\n    if ep.id.as_bytes().iter().any(|&b| b == 0) {\n        bug!(\"unsupported episode id (with NUL byte) for {:?}\", ep);\n    }\n\n    buf.extend_from_slice(ep.id.as_bytes());\n    buf.push(0x00);\n    buf.extend_from_slice(&to_optional_season(ep)?.to_be_bytes());\n    buf.extend_from_slice(&to_optional_epnum(ep)?.to_be_bytes());\n    buf.extend_from_slice(ep.tvshow_id.as_bytes());\n    Ok(())\n}\n\nfn from_optional_u32(\n    label: &'static str,\n    bytes: &[u8],\n) -> Result<Option<u32>> {\n    if bytes.len() < 4 {\n        bug!(\"not enough bytes to read optional {}\", label);\n    }\n    Ok(match u32::from_be_bytes(bytes[..4].try_into().unwrap()) {\n        u32::MAX => None,\n        x => Some(x),\n    })\n}\n\nfn to_optional_season(ep: &Episode) -> Result<u32> {\n    match ep.season {\n        None => Ok(u32::MAX),\n        Some(x) => {\n            if x == u32::MAX {\n                bug!(\"unsupported season number {} for {:?}\", x, ep);\n            }\n            Ok(x)\n        }\n    }\n}\n\nfn to_optional_epnum(ep: &Episode) -> Result<u32> {\n    match ep.episode {\n        None => Ok(u32::MAX),\n        Some(x) => {\n            if x == u32::MAX {\n                bug!(\"unsupported episode number {} for {:?}\", x, ep);\n            }\n          
  Ok(x)\n        }\n    }\n}\n\n#[cfg(test)]\nmod tests {\n    use super::Index;\n    use crate::index::tests::TestContext;\n    use std::collections::HashMap;\n\n    #[test]\n    fn basics() {\n        let ctx = TestContext::new(\"small\");\n        let idx = Index::create(ctx.data_dir(), ctx.index_dir()).unwrap();\n        let eps = idx.seasons(b\"tt0096697\").unwrap();\n\n        let mut counts: HashMap<u32, u32> = HashMap::new();\n        for ep in eps {\n            *counts.entry(ep.season.unwrap()).or_insert(0) += 1;\n        }\n        assert_eq!(counts.len(), 3);\n        assert_eq!(counts[&1], 13);\n        assert_eq!(counts[&2], 22);\n        assert_eq!(counts[&3], 24);\n    }\n\n    #[test]\n    fn by_season() {\n        let ctx = TestContext::new(\"small\");\n        let idx = Index::create(ctx.data_dir(), ctx.index_dir()).unwrap();\n        let eps = idx.episodes(b\"tt0096697\", 2).unwrap();\n\n        let mut counts: HashMap<u32, u32> = HashMap::new();\n        for ep in eps {\n            *counts.entry(ep.season.unwrap()).or_insert(0) += 1;\n        }\n        println!(\"{:?}\", counts);\n        assert_eq!(counts.len(), 1);\n        assert_eq!(counts[&2], 22);\n    }\n\n    #[test]\n    fn tvshow() {\n        let ctx = TestContext::new(\"small\");\n        let idx = Index::create(ctx.data_dir(), ctx.index_dir()).unwrap();\n        let ep = idx.episode(b\"tt0701063\").unwrap().unwrap();\n        assert_eq!(ep.tvshow_id, \"tt0096697\");\n    }\n}\n"
  },
  {
    "path": "imdb-index/src/index/id.rs",
    "content": "use std::fs::File;\nuse std::io;\nuse std::path::Path;\n\nuse memmap::Mmap;\n\nuse crate::error::{Error, Result};\nuse crate::util::{fst_map_builder_file, fst_map_file};\n\n/// An index that maps arbitrary length identifiers to 64-bit integers.\n///\n/// An ID index is often useful for mapping human readable identifiers or\n/// \"natural keys\" to other more convenient forms, such as file offsets.\n#[derive(Debug)]\npub struct IndexReader {\n    idx: fst::Map<Mmap>,\n}\n\nimpl IndexReader {\n    /// Open's an ID index reader from the given file path.\n    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<IndexReader> {\n        // We claim it is safe to open the following memory map because we\n        // don't mutate them and no other process (should) either.\n        Ok(IndexReader { idx: unsafe { fst_map_file(path)? } })\n    }\n\n    /// Return the integer associated with the given ID, if it exists.\n    pub fn get(&self, key: &[u8]) -> Option<u64> {\n        self.idx.get(key)\n    }\n}\n\n/// An ID index writer that requires that identifiers are given in\n/// lexicographically ascending order.\npub struct IndexSortedWriter<W> {\n    wtr: fst::MapBuilder<W>,\n}\n\nimpl IndexSortedWriter<io::BufWriter<File>> {\n    /// Create an index writer that writes the index to the given file path.\n    pub fn from_path<P: AsRef<Path>>(\n        path: P,\n    ) -> Result<IndexSortedWriter<io::BufWriter<File>>> {\n        Ok(IndexSortedWriter { wtr: fst_map_builder_file(path)? 
})\n    }\n}\n\nimpl<W: io::Write> IndexSortedWriter<W> {\n    /// Associate the given identifier with the given integer.\n    ///\n    /// If the given key is not strictly lexicographically greater than the\n    /// previous key, then an error is returned.\n    pub fn insert(&mut self, key: &[u8], value: u64) -> Result<()> {\n        self.wtr.insert(key, value).map_err(Error::fst)?;\n        Ok(())\n    }\n\n    /// Finish writing the index.\n    ///\n    /// This must be called, otherwise the index will likely be unreadable.\n    pub fn finish(self) -> Result<()> {\n        self.wtr.finish().map_err(Error::fst)?;\n        Ok(())\n    }\n}\n"
  },
  {
    "path": "imdb-index/src/index/mod.rs",
    "content": "use std::fs;\nuse std::io;\nuse std::path::{Path, PathBuf};\nuse std::thread;\nuse std::time::Instant;\n\nuse memmap::Mmap;\nuse serde::{Deserialize, Serialize};\n\nuse crate::error::{Error, Result};\nuse crate::record::{Episode, Rating, Title, TitleKind};\nuse crate::scored::SearchResults;\nuse crate::util::{\n    create_file, csv_file, csv_mmap, open_file, NiceDuration, IMDB_BASICS,\n};\n\npub use self::aka::AKARecordIter;\npub use self::names::{NameQuery, NameScorer, NgramType};\n\nmod aka;\nmod episode;\nmod id;\nmod names;\nmod rating;\n#[cfg(test)]\nmod tests;\nmod writer;\n\n/// The version of the index format on disk.\n///\n/// Generally speaking, if the version of the index on disk doesn't exactly\n/// match the version expected by this code, then the index won't be read.\n/// The caller must then re-generate the index.\n///\n/// This version represents all indexing structures on disk in this module.\nconst VERSION: u64 = 1;\n\n/// The name of the title file index.\n///\n/// This index represents a map from the IMDb title ID to the file offset\n/// corresponding to that record in title.basics.tsv.\nconst TITLE: &str = \"title.fst\";\n\n/// The name of the file containing the index configuration.\n///\n/// The index configuration is a JSON file with some meta data about this\n/// index, such as its version.\nconst CONFIG: &str = \"config.json\";\n\n/// A media entity is a title with optional episode and rating records.\n///\n/// A media entity makes it convenient to deal with the complete information\n/// of an IMDb media record. 
This is the default value returned by search\n/// routines such as what the [`Searcher`](struct.Searcher.html) provides, and\n/// can also be cheaply constructed by an [`Index`](struct.Index.html) given a\n/// [`Title`](struct.Title.html) or an IMDb ID.\n#[derive(Clone, Debug)]\npub struct MediaEntity {\n    title: Title,\n    episode: Option<Episode>,\n    rating: Option<Rating>,\n}\n\nimpl MediaEntity {\n    /// Return a reference to the underlying `Title`.\n    pub fn title(&self) -> &Title {\n        &self.title\n    }\n\n    /// Return a reference to the underlying `Episode`, if it exists.\n    pub fn episode(&self) -> Option<&Episode> {\n        self.episode.as_ref()\n    }\n\n    /// Return a reference to the underlying `Rating`, if it exists.\n    pub fn rating(&self) -> Option<&Rating> {\n        self.rating.as_ref()\n    }\n}\n\n/// An index into IMDb titles and their associated data.\n///\n/// This index consists of a set of on disk index data structures in addition\n/// to the uncompressed IMDb `tsv` files. 
The on disk index structures are used\n/// to provide access to the records in the `tsv` files efficiently.\n///\n/// With this index, one can do the following things:\n///\n/// * Return a ranked list\n///   [`Title`](struct.Title.html)\n///   records matching a fuzzy name query.\n/// * Access any `Title` record by ID in constant time.\n/// * Access all\n///   [`AKA`](struct.AKA.html)\n///   records for any `Title` in constant time.\n/// * Access the\n///   [`Rating`](struct.Rating.html)\n///   for any `Title` in constant time.\n/// * Access the complete set of\n///   [`Episode`](struct.Episode.html)\n///   records for any TV show in constant time.\n/// * Access the specific `Episode` given its ID in constant time.\n#[derive(Debug)]\npub struct Index {\n    /// The directory containing the IMDb tsv files.\n    data_dir: PathBuf,\n    /// The directory containing this crate's index structures.\n    index_dir: PathBuf,\n    /// A seekable reader for `title.basics.tsv`. The index structures\n    /// typically return offsets that can be used to seek this reader to the\n    /// beginning of any `Title` record.\n    csv_basic: csv::Reader<io::Cursor<Mmap>>,\n    /// The name index. This is what provides fuzzy queries.\n    idx_names: names::IndexReader,\n    /// The AKA index.\n    idx_aka: aka::Index,\n    /// The episode index.\n    idx_episode: episode::Index,\n    /// The rating index.\n    idx_rating: rating::Index,\n    /// The title index.\n    idx_title: id::IndexReader,\n}\n\n#[derive(Debug, Deserialize, Serialize)]\nstruct Config {\n    version: u64,\n}\n\nimpl Index {\n    /// Open an existing index using default settings. If the index does not\n    /// exist, or if there was a problem opening it, then this returns an\n    /// error.\n    ///\n    /// Generally, this method is cheap to call. 
It opens some file\n    /// descriptors, but otherwise does no work.\n    ///\n    /// `data_dir` should be the directory containing decompressed IMDb\n    /// `tsv` files. See: https://www.imdb.com/interfaces/\n    ///\n    /// `index_dir` should be the directory containing a previously created\n    /// index using `Index::create`.\n    pub fn open<P1: AsRef<Path>, P2: AsRef<Path>>(\n        data_dir: P1,\n        index_dir: P2,\n    ) -> Result<Index> {\n        IndexBuilder::new().open(data_dir, index_dir)\n    }\n\n    /// Create a new index using default settings.\n    ///\n    /// Calling this method is expensive, and one should expect this to take\n    /// dozens of seconds or more to complete.\n    ///\n    /// `data_dir` should be the directory containing decompressed IMDb tsv`\n    /// `files. See: https://www.imdb.com/interfaces/\n    ///\n    /// `index_dir` should be the directory containing a previously created\n    /// index using `Index::create`.\n    ///\n    /// This will overwrite any previous index that may have existed in\n    /// `index_dir`.\n    pub fn create<P1: AsRef<Path>, P2: AsRef<Path>>(\n        data_dir: P1,\n        index_dir: P2,\n    ) -> Result<Index> {\n        IndexBuilder::new().create(data_dir, index_dir)\n    }\n\n    /// Attempt to clone this index, returning a distinct `Index`.\n    ///\n    /// This is as cheap to call as `Index::open` and returns an error if there\n    /// was a problem reading the underlying index.\n    ///\n    /// This is useful when one wants to query the same `Index` on disk from\n    /// multiple threads.\n    pub fn try_clone(&self) -> Result<Index> {\n        Index::open(&self.data_dir, &self.index_dir)\n    }\n\n    /// Search this index for `Title` records whose name matches the given\n    /// query.\n    ///\n    /// The query controls the following things:\n    ///\n    /// * The name to search for.\n    /// * The maximum number of results returned.\n    /// * The scorer to use to rank 
results.\n    ///\n    /// The name can be any string. It is normalized and broken down into\n    /// component pieces, which are then used to quickly search all existing\n    /// titles quickly and fuzzily.\n    ///\n    /// This returns an error if there was a problem reading the index or the\n    /// underlying CSV data.\n    pub fn search(\n        &mut self,\n        query: &names::NameQuery,\n    ) -> Result<SearchResults<Title>> {\n        let mut results = SearchResults::new();\n        // The name index gives us back scores with offsets. The offset can be\n        // used to seek our `Title` CSV reader to the corresponding record and\n        // read it in constant time.\n        for result in self.idx_names.search(query) {\n            let title = match self.read_record(*result.value())? {\n                None => continue,\n                Some(title) => title,\n            };\n            results.push(result.map(|_| title));\n        }\n        Ok(results)\n    }\n\n    /// Returns the `MediaEntity` for the given IMDb ID.\n    ///\n    /// An entity includes an [`Episode`](struct.Episode.html) and\n    /// [`Rating`](struct.Rating.html) records if they exist for the title.\n    ///\n    /// This returns an error if there was a problem reading the underlying\n    /// index. If no such title exists for the given ID, then `None` is\n    /// returned.\n    pub fn entity(&mut self, id: &str) -> Result<Option<MediaEntity>> {\n        match self.title(id)? 
{\n            None => Ok(None),\n            Some(title) => self.entity_from_title(title).map(Some),\n        }\n    }\n\n    /// Returns the `MediaEntity` for the given `Title`.\n    ///\n    /// This is like the `entity` method, except it takes a `Title` record as\n    /// given.\n    pub fn entity_from_title(&mut self, title: Title) -> Result<MediaEntity> {\n        let episode = match title.kind {\n            TitleKind::TVEpisode => self.episode(&title.id)?,\n            _ => None,\n        };\n        let rating = self.rating(&title.id)?;\n        Ok(MediaEntity { title, episode, rating })\n    }\n\n    /// Returns the `Title` record for the given IMDb ID.\n    ///\n    /// This returns an error if there was a problem reading the underlying\n    /// index. If no such title exists for the given ID, then `None` is\n    /// returned.\n    pub fn title(&mut self, id: &str) -> Result<Option<Title>> {\n        match self.idx_title.get(id.as_bytes()) {\n            None => Ok(None),\n            Some(offset) => self.read_record(offset),\n        }\n    }\n\n    /// Returns an iterator over all `AKA` records for the given IMDb ID.\n    ///\n    /// If no AKA records exist for the given ID, then an empty iterator is\n    /// returned.\n    ///\n    /// If there was a problem reading the index, then an error is returned.\n    pub fn aka_records(&mut self, id: &str) -> Result<AKARecordIter> {\n        self.idx_aka.find(id.as_bytes())\n    }\n\n    /// Returns the `Rating` associated with the given IMDb ID.\n    ///\n    /// If no rating exists for the given ID, then this returns `None`.\n    ///\n    /// If there was a problem reading the index, then an error is returned.\n    pub fn rating(&mut self, id: &str) -> Result<Option<Rating>> {\n        self.idx_rating.rating(id.as_bytes())\n    }\n\n    /// Returns all of the episodes for the given TV show. 
The TV show should\n    /// be identified by its IMDb ID.\n    ///\n    /// If the given ID isn't a TV show or if the TV show doesn't have any\n    /// episodes, then an empty list is returned.\n    ///\n    /// The episodes returned are sorted in order of their season and episode\n    /// numbers. Episodes without a season or episode number are sorted after\n    /// episodes with a season or episode number.\n    ///\n    /// If there was a problem reading the index, then an error is returned.\n    pub fn seasons(&mut self, tvshow_id: &str) -> Result<Vec<Episode>> {\n        self.idx_episode.seasons(tvshow_id.as_bytes())\n    }\n\n    /// Returns all of the episodes for the given TV show and season number.\n    /// The TV show should be identified by its IMDb ID, and the season should\n    /// be identified by its number. (Season numbers generally start at `1`.)\n    ///\n    /// If the given ID isn't a TV show or if the TV show doesn't have any\n    /// episodes for the given season, then an empty list is returned.\n    ///\n    /// The episodes returned are sorted in order of their episode numbers.\n    /// Episodes without an episode number are sorted after episodes with an\n    /// episode number.\n    ///\n    /// If there was a problem reading the index, then an error is returned.\n    pub fn episodes(\n        &mut self,\n        tvshow_id: &str,\n        season: u32,\n    ) -> Result<Vec<Episode>> {\n        self.idx_episode.episodes(tvshow_id.as_bytes(), season)\n    }\n\n    /// Return the episode corresponding to the given IMDb ID.\n    ///\n    /// If the ID doesn't correspond to an episode, then `None` is returned.\n    ///\n    /// If there was a problem reading the index, then an error is returned.\n    pub fn episode(&mut self, episode_id: &str) -> Result<Option<Episode>> {\n        self.idx_episode.episode(episode_id.as_bytes())\n    }\n\n    /// Returns the data directory that this index returns results for.\n    pub fn data_dir(&self) -> &Path 
{\n        &self.data_dir\n    }\n\n    /// Returns the directory containing this index's files.\n    pub fn index_dir(&self) -> &Path {\n        &self.index_dir\n    }\n\n    /// Read the CSV `Title` record beginning at the given file offset.\n    ///\n    /// If no such record exists, then this returns `None`.\n    ///\n    /// If there was a problem reading the underlying CSV data, then an error\n    /// is returned.\n    ///\n    /// If the given offset does not point to the start of a record in the CSV\n    /// data, then the behavior of this method is unspecified.\n    fn read_record(&mut self, offset: u64) -> Result<Option<Title>> {\n        let mut pos = csv::Position::new();\n        pos.set_byte(offset);\n        self.csv_basic.seek(pos).map_err(Error::csv)?;\n\n        let mut record = csv::StringRecord::new();\n        if !self.csv_basic.read_record(&mut record).map_err(Error::csv)? {\n            Ok(None)\n        } else {\n            let headers = self.csv_basic.headers().map_err(Error::csv)?;\n            Ok(record.deserialize(Some(headers)).map_err(Error::csv)?)\n        }\n    }\n}\n\n/// A builder for opening or creating an `Index`.\n#[derive(Debug)]\npub struct IndexBuilder {\n    ngram_type: NgramType,\n    ngram_size: usize,\n}\n\nimpl IndexBuilder {\n    /// Create a new builder with a default configuration.\n    pub fn new() -> IndexBuilder {\n        IndexBuilder { ngram_type: NgramType::default(), ngram_size: 3 }\n    }\n\n    /// Use the current configuration to open an existing index. If the index\n    /// does not exist, or if there was a problem opening it, then this returns\n    /// an error.\n    ///\n    /// Generally, this method is cheap to call. It opens some file\n    /// descriptors, but otherwise does no work.\n    ///\n    /// `data_dir` should be the directory containing decompressed IMDb tsv`\n    /// `files. 
See: https://www.imdb.com/interfaces/\n    ///\n    /// `index_dir` should be the directory containing a previously created\n    /// index using `Index::create`.\n    ///\n    /// Note that settings for index creation are ignored.\n    pub fn open<P1: AsRef<Path>, P2: AsRef<Path>>(\n        &self,\n        data_dir: P1,\n        index_dir: P2,\n    ) -> Result<Index> {\n        let data_dir = data_dir.as_ref();\n        let index_dir = index_dir.as_ref();\n        log::debug!(\"opening index {}\", index_dir.display());\n\n        let config_file = open_file(index_dir.join(CONFIG))?;\n        let config: Config = serde_json::from_reader(config_file)\n            .map_err(|e| Error::config(e.to_string()))?;\n        if config.version != VERSION {\n            return Err(Error::version(VERSION, config.version));\n        }\n\n        Ok(Index {\n            data_dir: data_dir.to_path_buf(),\n            index_dir: index_dir.to_path_buf(),\n            // We claim it is safe to open the following memory map because we\n            // don't mutate them and no other process (should) either.\n            csv_basic: unsafe { csv_mmap(data_dir.join(IMDB_BASICS))? },\n            idx_names: names::IndexReader::open(index_dir)?,\n            idx_aka: aka::Index::open(data_dir, index_dir)?,\n            idx_episode: episode::Index::open(index_dir)?,\n            idx_rating: rating::Index::open(index_dir)?,\n            idx_title: id::IndexReader::from_path(index_dir.join(TITLE))?,\n        })\n    }\n\n    /// Use the current configuration to create a new index.\n    ///\n    /// Calling this method is expensive, and one should expect this to take\n    /// dozens of seconds or more to complete.\n    ///\n    /// `data_dir` should be the directory containing decompressed IMDb tsv`\n    /// `files. 
See: https://www.imdb.com/interfaces/\n    ///\n    /// `index_dir` should be the directory containing a previously created\n    /// index using `Index::create`.\n    ///\n    /// This will overwrite any previous index that may have existed in\n    /// `index_dir`.\n    pub fn create<P1: AsRef<Path>, P2: AsRef<Path>>(\n        &self,\n        data_dir: P1,\n        index_dir: P2,\n    ) -> Result<Index> {\n        let data_dir = data_dir.as_ref();\n        let index_dir = index_dir.as_ref();\n        fs::create_dir_all(index_dir)\n            .map_err(|e| Error::io_path(e, index_dir))?;\n        log::info!(\"creating index at {}\", index_dir.display());\n\n        // Creating the rating and episode indices are completely independent\n        // from the name/AKA indexes, so do them in a background thread. The\n        // episode index takes long enough to build to justify this.\n        let job = {\n            let data_dir = data_dir.to_path_buf();\n            let index_dir = index_dir.to_path_buf();\n            thread::spawn(move || -> Result<()> {\n                let start = Instant::now();\n                rating::Index::create(&data_dir, &index_dir)?;\n                log::info!(\n                    \"created rating index (took {})\",\n                    NiceDuration::since(start)\n                );\n\n                let start = Instant::now();\n                episode::Index::create(&data_dir, &index_dir)?;\n                log::info!(\n                    \"created episode index (took {})\",\n                    NiceDuration::since(start)\n                );\n                Ok(())\n            })\n        };\n\n        let start = Instant::now();\n        let mut aka_index = aka::Index::create(data_dir, index_dir)?;\n        log::info!(\"created AKA index (took {})\", NiceDuration::since(start));\n\n        let start = Instant::now();\n        create_name_index(\n            &mut aka_index,\n            data_dir,\n            index_dir,\n            
self.ngram_type,\n            self.ngram_size,\n        )?;\n        log::info!(\n            \"created name index, ngram type: {}, ngram size: {} (took {})\",\n            self.ngram_type,\n            self.ngram_size,\n            NiceDuration::since(start)\n        );\n\n        job.join().unwrap()?;\n\n        // Write out our config.\n        let config_file = create_file(index_dir.join(CONFIG))?;\n        serde_json::to_writer_pretty(\n            config_file,\n            &Config { version: VERSION },\n        )\n        .map_err(|e| Error::config(e.to_string()))?;\n\n        self.open(data_dir, index_dir)\n    }\n\n    /// Set the type of ngram generation to use.\n    ///\n    /// The default type is `Window`.\n    pub fn ngram_type(&mut self, ngram_type: NgramType) -> &mut IndexBuilder {\n        self.ngram_type = ngram_type;\n        self\n    }\n\n    /// Set the ngram size on this index.\n    ///\n    /// When creating an index, ngrams with this size will be used.\n    pub fn ngram_size(&mut self, ngram_size: usize) -> &mut IndexBuilder {\n        self.ngram_size = ngram_size;\n        self\n    }\n}\n\nimpl Default for IndexBuilder {\n    fn default() -> IndexBuilder {\n        IndexBuilder::new()\n    }\n}\n\n/// Creates the name index from the title tsv data and an AKA index. 
The AKA\n/// index is used to index additional names for each title record to improve\n/// recall during search.\n///\n/// To avoid a second pass through the title records, this also creates the\n/// title ID index, which provides an index for looking up a `Title` by its\n/// ID in constant time.\nfn create_name_index(\n    aka_index: &mut aka::Index,\n    data_dir: &Path,\n    index_dir: &Path,\n    ngram_type: NgramType,\n    ngram_size: usize,\n) -> Result<()> {\n    // For logging.\n    let (mut count, mut title_count) = (0u64, 0u64);\n\n    let mut wtr = names::IndexWriter::open(index_dir, ngram_type, ngram_size)?;\n    let mut twtr = id::IndexSortedWriter::from_path(index_dir.join(TITLE))?;\n\n    let mut rdr = csv_file(data_dir.join(IMDB_BASICS))?;\n    let mut record = csv::StringRecord::new();\n    while rdr.read_record(&mut record).map_err(Error::csv)? {\n        let pos = record.position().expect(\"position on row\");\n        let id = &record[0];\n        let title = &record[2];\n        let original_title = &record[3];\n        let is_adult = &record[4] == \"1\";\n        if is_adult {\n            // TODO: Expose an option to permit this.\n            continue;\n        }\n        count += 1;\n        title_count += 1;\n\n        twtr.insert(id.as_bytes(), pos.byte())?;\n        // Index the primary name.\n        wtr.insert(pos.byte(), title)?;\n        if title != original_title {\n            // Index the \"original\" name.\n            wtr.insert(pos.byte(), original_title)?;\n            count += 1;\n        }\n        // Now index all of the alternate names, if they exist.\n        for result in aka_index.find(id.as_bytes())? 
{\n            let akarecord = result?;\n            if title != akarecord.title {\n                wtr.insert(pos.byte(), &akarecord.title)?;\n                count += 1;\n            }\n        }\n    }\n    wtr.finish()?;\n    twtr.finish()?;\n\n    log::info!(\"{} titles indexed\", title_count);\n    log::info!(\"{} total names indexed\", count);\n    Ok(())\n}\n"
  },
  {
    "path": "imdb-index/src/index/names.rs",
    "content": "use std::cmp;\nuse std::collections::{binary_heap, BinaryHeap};\nuse std::fmt;\nuse std::fs::File;\nuse std::io::{self, Write};\nuse std::path::Path;\nuse std::str::{self, FromStr};\nuse std::time::Instant;\n\nuse fnv::FnvHashMap;\nuse memmap::Mmap;\nuse serde::{Deserialize, Serialize};\n\nuse crate::error::{Error, Result};\nuse crate::index::writer::CursorWriter;\nuse crate::scored::{Scored, SearchResults};\nuse crate::util::{\n    fst_map_builder_file, fst_map_file, mmap_file, open_file, NiceDuration,\n};\n\n/// The name of the file containing the index configuration.\n///\n/// The index configuration is a JSON file with some meta data about this\n/// index, such as its version, ngram size and aggregate statistics about the\n/// corpus that has been indexed.\nconst CONFIG: &str = \"names.config.json\";\n\n/// The name of the ngram term index.\n///\n/// The ngram term index maps ngrams (fixed size sequences of Unicode\n/// codepoints) to file offsets. Each file offset points to the postings for\n/// the corresponding term.\nconst NGRAM: &str = \"names.ngram.fst\";\n\n/// The name of the postings list index.\n///\n/// The postings list contains an entry for every term in the ngram index.\n/// Each entry corresponds to a list of document/frequency pairs. Namely, each\n/// entry is a DocID and a frequency count indicating how many times the\n/// corresponding term appeared in that document. Each entry in the list is\n/// encoded as a single 32 little-endian integer. The high 4 bits represent\n/// the frequency (which is capped at 15, a reasonable number for indexing\n/// short name strings) while the low 28 bits represent the doc id. The\n/// `MAX_DOC_ID` constant below ensures we make sure to never use a doc id\n/// that won't fit this encoding scheme.\n///\n/// The last eight bytes in the postings index contains a 64-bit little-endian\n/// encoded integer indicating the average length of all documents represented\n/// by the ngram index. 
The length is recorded in units of terms, which\n/// generally correspond to the total number of ngrams in a name.\nconst POSTINGS: &str = \"names.postings.idx\";\n\n/// The name of the identifier map index.\n///\n/// This file maps `DocID`s to `NameID`s. It consists of a sequence of\n/// 64-bit little-endian encoded integers, where the length of the sequence\n/// corresponds to the total number of names in the index. Each entry in the\n/// sequence encodes a `NameID`. In other words, the index to this sequence is\n/// a `DocID` and the value at that index is a `NameID`.\n///\n/// The id map is used to map doc ids returned by the postings to name ids\n/// which were provided by the caller. This also permits search to deduplicate\n/// results. That is, we should never return multiple results for the same\n/// NameID, even though we may have indexed multiple names for the same name\n/// id.\nconst IDMAP: &str = \"names.idmap.idx\";\n\n/// The name of the document length index.\n///\n/// This file consists of a sequence of 16-bit little-endian encoded\n/// integers, where the length of the sequence corresponds to the total number\n/// of names in the index. Each entry represents the length, in terms, of each\n/// name.\n///\n/// The lengths are used during scoring to compute a normalization term. This\n/// allows the scoring mechanism to take document length into account.\nconst NORMS: &str = \"names.norms.idx\";\n\n/// The external identifier for every distinct record represented by this name\n/// index. There are no restrictions on name ids, and multiple names may be\n/// indexed that correspond to the same name id.\n///\n/// With respect to IMDb, there is a 1-to-1 correspondence between the records\n/// in title.basics.tsv and the set of NameIDs, even though there may be\n/// multiple names for each record.\n///\n/// For IMDb, this is represented by the byte offset of the corresponding\n/// record in title.basics.tsv. 
This provides constant time lookup to full\n/// record. Note, though, that this module knows nothing about such things.\n/// To this module, name ids are opaque identifiers.\npub type NameID = u64;\n\n/// An internal surrogate identifier for every distinct name in the index. Note\n/// that multiple distinct doc ids can map to the same name id. For example, if\n/// a name has multiple distinct forms, then they each get their own docid, but\n/// each of the docids will map to the same name id.\n///\n/// The reason why we need DocID in addition to NameID is two fold:\n///\n/// 1. Firstly, we'd like each name variant to have its own term frequency\n///    count. If every variant shared the same internal id, then names with\n///    multiple variants would behave as if they were one long name with each\n///    variant concatenated together. Our ranking scheme takes document length\n///    into account, so we don't want this.\n/// 2. Secondly, using an internal ID gives us control over the structure of\n///    those ids. For example, we can declare them to be a sorted sequence of\n///    increasing integers. This lets us traverse our postings more efficiently\n///    during search.\ntype DocID = u32;\n\n/// The maximum docid allowed.\n///\n/// When writing postings, we pack docids and their term frequency counts into\n/// a single u32. We give 4 bits for frequency and 28 bits for docid. 
That\n/// means we can permit up to 268,435,455 = (1<<28)-1 names, which is plenty\n/// for all unique names in IMDb.\nconst MAX_DOC_ID: DocID = (1 << 28) - 1;\n\n/// A query for searching the name index.\n///\n/// A query provides the name query and defines the maximum number of results\n/// returned by searching the name index.\n#[derive(Clone, Debug)]\npub struct NameQuery {\n    name: String,\n    size: usize,\n    scorer: NameScorer,\n    stop_word_ratio: f64,\n}\n\nimpl NameQuery {\n    /// Create a query that searches the given name.\n    pub fn new(name: &str) -> NameQuery {\n        NameQuery {\n            name: name.to_string(),\n            size: 30,\n            scorer: NameScorer::default(),\n            stop_word_ratio: 0.01,\n        }\n    }\n\n    /// Set this query's result set size. At most `size` results will be\n    /// returned when searching with this query.\n    pub fn with_size(self, size: usize) -> NameQuery {\n        NameQuery { size, ..self }\n    }\n\n    /// Set this query's scorer. By default, Okapi BM25 is used.\n    pub fn with_scorer(self, scorer: NameScorer) -> NameQuery {\n        NameQuery { scorer, ..self }\n    }\n\n    /// Set the ratio (in the range `0.0` to `1.0`, inclusive) at which a term\n    /// is determined to be a stop word. Set to `0.0` to disable. By default\n    /// this is set to a non-zero value.\n    ///\n    /// This ratio is used at query time to partition all of the ngrams in the\n    /// query into two bins: one bin is for \"low frequency\" ngrams while the\n    /// other is for \"high frequency\" ngrams. The partitioning is determined\n    /// by this ratio. Namely, if an ngram occurs in fewer than `ratio`\n    /// documents in the entire corpus, then it is considered a low frequency\n    /// ngram.\n    ///\n    /// Once these two partitions are created, both are used to create two\n    /// disjunction queries. 
The low frequency query drives search results,\n    /// while the high frequency query is only used to boost scores when it\n    /// matches a result yielded by the low frequency query. Otherwise, results\n    /// from the high frequency query aren't considered.\n    pub fn with_stop_word_ratio(self, ratio: f64) -> NameQuery {\n        NameQuery { stop_word_ratio: ratio, ..self }\n    }\n}\n\n/// A reader for the name index.\n#[derive(Debug)]\npub struct IndexReader {\n    /// The configuration of this index. This is how we determine index-time\n    /// settings automatically, such as ngram size and type.\n    config: Config,\n    /// The ngram index, also known more generally as the \"term index.\" It maps\n    /// terms (which are ngrams for this index) to offsets into the postings\n    /// file. The offset indicates the start of a list of document ids\n    /// containing that term.\n    ngram: fst::Map<Mmap>,\n    /// The postings. This corresponds to a sequence of lists, where each list\n    /// is a list of document ID/frequency pairs. Each list corresponds to the\n    /// document ids containing a particular term. The beginning of each list\n    /// is pointed to by an offset in the term index.\n    postings: Mmap,\n    /// A sequence of 64-bit little-endian encoded integers that provide a\n    /// map from document ID to name ID. The document ID is an internal\n    /// identifier assigned to each unique name indexed, while the name ID is\n    /// an external identifier provided by users of this index.\n    ///\n    /// This map is used to return name IDs to callers. Namely, results are\n    /// natively represented by document IDs, but they are mapped to name IDs\n    /// during collection of results and subsequently deduped. 
In particular,\n    /// multiple document IDs can map to the same name ID.\n    ///\n    /// The number of entries in this map is equivalent to the total number of\n    /// names indexed.\n    idmap: Mmap,\n    /// A sequence of 16-bit little-endian encoded integers indicating the\n    /// document length (in terms) of the corresponding document ID.\n    ///\n    /// The number of entries in this map is equivalent to the total number of\n    /// names indexed.\n    norms: Mmap,\n}\n\n/// The configuration for this name index. It is JSON encoded to disk.\n///\n/// Note that we don't track the version here. Instead, it is tracked wholesale\n/// as part of the parent index.\n#[derive(Debug, Deserialize, Serialize)]\nstruct Config {\n    ngram_type: NgramType,\n    ngram_size: usize,\n    avg_document_len: f64,\n    num_documents: u64,\n}\n\nimpl IndexReader {\n    /// Open a name index in the given directory.\n    pub fn open<P: AsRef<Path>>(dir: P) -> Result<IndexReader> {\n        let dir = dir.as_ref();\n\n        // All of the following open memory maps. We claim it is safe because\n        // we don't mutate them and no other process (should) either.\n        let ngram = unsafe { fst_map_file(dir.join(NGRAM))? };\n        let postings = unsafe { mmap_file(dir.join(POSTINGS))? };\n        let idmap = unsafe { mmap_file(dir.join(IDMAP))? };\n        let norms = unsafe { mmap_file(dir.join(NORMS))? 
};\n\n        let config_file = open_file(dir.join(CONFIG))?;\n        let config: Config = serde_json::from_reader(config_file)\n            .map_err(|e| Error::config(e.to_string()))?;\n        Ok(IndexReader { config, ngram, postings, idmap, norms })\n    }\n\n    /// Execute a search.\n    pub fn search(&self, query: &NameQuery) -> SearchResults<NameID> {\n        let start = Instant::now();\n        let mut searcher = Searcher::new(self, query);\n        let results = CollectTopK::new(query.size).collect(&mut searcher);\n        log::debug!(\n            \"search for {:?} took {}\",\n            query,\n            NiceDuration::since(start)\n        );\n        results\n    }\n\n    /// Return the name ID used to index the given document id.\n    ///\n    /// This panics if the given document id does not correspond to an indexed\n    /// document.\n    fn docid_to_nameid(&self, docid: DocID) -> NameID {\n        let start = 8 * (docid as usize);\n        let buf = self.idmap[start..start + 8].try_into().unwrap();\n        u64::from_le_bytes(buf)\n    }\n\n    /// Return the length, in terms, of the given document.\n    ///\n    /// This panics if the given document id does not correspond to an indexed\n    /// document.\n    fn document_length(&self, docid: DocID) -> u64 {\n        let start = 2 * (docid as usize);\n        let buf = self.norms[start..start + 2].try_into().unwrap();\n        u16::from_le_bytes(buf) as u64\n    }\n}\n\n/// A collector for gathering the top K results from a search.\n///\n/// This maintains a min-heap of search results. When a new result is\n/// considered, it is compared against the worst result in the heap. If the\n/// candidate is worse, then it is discarded. Otherwise, it is shuffled into\n/// the heap.\nstruct CollectTopK {\n    /// The total number of hits to collect.\n    k: usize,\n    /// The min-heap, according to score. 
Note that since BinaryHeap is a\n    /// max-heap by default, we reverse the comparison to get a min-heap.\n    queue: BinaryHeap<cmp::Reverse<Scored<NameID>>>,\n    /// A set for deduplicating results. Namely, multiple doc IDs can map to\n    /// the same name ID. This set makes sure we only collect one name ID.\n    ///\n    /// We map name IDs to scores. In this way, we always report the best\n    /// scoring match.\n    byid: FnvHashMap<NameID, f64>,\n}\n\nimpl CollectTopK {\n    /// Build a new collector that collects at most `k` results.\n    fn new(k: usize) -> CollectTopK {\n        CollectTopK {\n            k,\n            queue: BinaryHeap::with_capacity(k),\n            byid: FnvHashMap::default(),\n        }\n    }\n\n    /// Collect the top K results from the given searcher using the given\n    /// index reader. Return the results with normalized scores sorted in\n    /// order of best-to-worst.\n    fn collect(mut self, searcher: &mut Searcher) -> SearchResults<NameID> {\n        if self.k == 0 {\n            return SearchResults::new();\n        }\n        let index = searcher.index();\n        let (mut count, mut push_count) = (0, 0);\n        for scored_with_docid in searcher {\n            count += 1;\n            let scored = scored_with_docid.map(|v| index.docid_to_nameid(v));\n            // Since multiple names can correspond to a single IMDb title,\n            // we dedup our results here. 
That is, if our result set\n            // already contains this result, then update the score if need\n            // be, and then move on.\n            if let Some(&score) = self.byid.get(scored.value()) {\n                if scored.score() > score {\n                    self.byid.insert(*scored.value(), scored.score());\n                }\n                continue;\n            }\n\n            let mut dopush = self.queue.len() < self.k;\n            if !dopush {\n                // This unwrap is OK because k > 0 and queue is non-empty.\n                let worst = self.queue.peek_mut().unwrap();\n                // If our queue is full, then we should only push if this\n                // doc id has a better score than the worst one in the queue.\n                if worst.0 < scored {\n                    self.byid.remove(worst.0.value());\n                    binary_heap::PeekMut::pop(worst);\n                    dopush = true;\n                }\n            }\n            if dopush {\n                push_count += 1;\n                self.byid.insert(*scored.value(), scored.score());\n                self.queue.push(cmp::Reverse(scored));\n            }\n        }\n        log::debug!(\n            \"collect count: {:?}, collect push count: {:?}\",\n            count,\n            push_count\n        );\n\n        // Pull out the results from our heap and normalize the scores.\n        let mut results = SearchResults::from_min_heap(&mut self.queue);\n        results.normalize();\n        results\n    }\n}\n\n/// A searcher for resolving fulltext queries.\n///\n/// A searcher takes a fulltext query, usually typed by an end user, along with\n/// a scoring function and produces a stream of matching results with scores\n/// computed via the provided function. 
Results are always yielded in\n/// ascending order with respect to document IDs, which are internal IDs\n/// assigned to each name in the index.\n///\n/// This searcher combines a bit of smarts to handle stop words, usually\n/// referred to as \"dynamic stop word detection.\" Namely, after the searcher\n/// splits the query into ngrams, it partitions the ngrams into infrequently\n/// occurring ngrams and frequently occurring ngrams, according to some\n/// hard-coded threshold. Each group is then turned into a `Disjunction`\n/// query. The searcher then visits every doc ID that matches the infrequently\n/// occurring disjunction. When a score is computed for a doc ID, then its\n/// score is increased if the frequently occurring disjunction also contains\n/// that same doc ID. Otherwise, the frequently occurring disjunction isn't\n/// consulted at all, which permits skipping the score calculation for a\n/// potentially large number of doc IDs.\n///\n/// When two partitions cannot be created (e.g., all of the terms are\n/// infrequently occurring or all of the terms are frequently occurring), then\n/// only one disjunction query is used and no skipping logic is employed. That\n/// means that a query consisting of all high frequency terms could be quite\n/// slow.\n///\n/// This does of course sacrifice recall for a performance benefit, but so do\n/// all filtering strategies based on stop words. The benefit of this \"dynamic\"\n/// approach is that stop word detection is tailored exactly to the corpus, and\n/// that stop words can still influence scoring. That means queries like \"the\n/// matrix\" will match \"The Matrix\" better than \"Matrix\" (which is a legitimate\n/// example, try it).\nstruct Searcher<'i> {\n    /// A handle to the index.\n    index: &'i IndexReader,\n    /// The primary disjunction query that drives results. 
Typically, this\n    /// corresponds to the infrequent terms in the query.\n    primary: Disjunction<'i>,\n    /// A disjunction of only high frequency terms. When the query consists\n    /// of exclusively high frequency terms, then this is empty (which matches\n    /// nothing) and `primary` is set to the disjunction of terms.\n    high: Disjunction<'i>,\n}\n\nimpl<'i> Searcher<'i> {\n    /// Create a new searcher.\n    fn new(idx: &'i IndexReader, query: &NameQuery) -> Searcher<'i> {\n        let num_docs = idx.config.num_documents as f64;\n        let (mut low, mut high) = (vec![], vec![]);\n        let (mut low_terms, mut high_terms) = (vec![], vec![]);\n\n        let name = normalize_query(&query.name);\n        let mut query_len = 0;\n        let mut multiset = FnvHashMap::default();\n        idx.config.ngram_type.iter(idx.config.ngram_size, &name, |term| {\n            *multiset.entry(term).or_insert(0) += 1;\n            query_len += 1;\n        });\n        for (term, &count) in multiset.iter() {\n            let postings = PostingIter::new(idx, query.scorer, count, term);\n            let ratio = (postings.len() as f64) / num_docs;\n            if ratio < query.stop_word_ratio {\n                low.push(postings);\n                low_terms.push(format!(\"{}:{}:{:0.6}\", term, count, ratio));\n            } else {\n                high.push(postings);\n                high_terms.push(format!(\"{}:{}:{:0.6}\", term, count, ratio));\n            }\n        }\n        log::debug!(\"starting search for: {:?}\", name);\n        log::debug!(\"{:?} low frequency terms: {:?}\", low.len(), low_terms);\n        log::debug!(\"{:?} high frequency terms: {:?}\", high.len(), high_terms);\n\n        if low.is_empty() {\n            Searcher {\n                index: idx,\n                primary: Disjunction::new(idx, query_len, query.scorer, high),\n                high: Disjunction::empty(idx, query.scorer),\n            }\n        } else {\n            Searcher {\n 
               index: idx,\n                primary: Disjunction::new(idx, query_len, query.scorer, low),\n                high: Disjunction::new(idx, query_len, query.scorer, high),\n            }\n        }\n    }\n\n    /// Return a reference to the underlying index reader.\n    fn index(&self) -> &'i IndexReader {\n        self.index\n    }\n}\n\nimpl<'i> Iterator for Searcher<'i> {\n    type Item = Scored<DocID>;\n\n    fn next(&mut self) -> Option<Scored<DocID>> {\n        // This is pretty simple. We drive the iterator via the primary\n        // disjunction, which is usually a disjunction of infrequently\n        // occurring ngrams.\n        let mut scored = match self.primary.next() {\n            None => return None,\n            Some(scored) => scored,\n        };\n        // We then skip our frequently occurring disjunction to the doc ID\n        // yielded above. Any frequently occurring ngrams found then improve\n        // this score. This makes queries like 'the matrix' match 'The Matrix'\n        // better than 'Matrix'.\n        if let Some(other_scored) = self.high.skip_to(*scored.value()) {\n            scored = scored.map_score(|s| s + other_scored.score());\n        }\n        Some(scored)\n    }\n}\n\n/// A disjunction over a collection of ngrams. A disjunction yields scored\n/// document IDs for every document that contains any of the terms in this\n/// disjunction. The more ngrams that match the document in the disjunction,\n/// the better the score.\nstruct Disjunction<'i> {\n    /// A handle to the underlying index that we're searching.\n    index: &'i IndexReader,\n    /// The number of ngrams in the original query.\n    ///\n    /// This is not necessarily equivalent to the number of ngrams in this\n    /// specific disjunction. Namely, this is used to compute scores, and it\n    /// is important that scores are computed using the total number of ngrams\n    /// and not the number of ngrams in a specific disjunction. 
For example,\n    /// if a query consisted of 8 infrequent ngrams and 1 frequent ngram, then\n    /// the disjunction containing the single frequent ngram would contribute a\n    /// disproportionately high score.\n    query_len: f64,\n    /// The scoring function to use.\n    scorer: NameScorer,\n    /// A min-heap of posting iterators. Each posting iterator corresponds to\n    /// an iterator over (doc ID, frequency) pairs for a single ngram, sorted\n    /// by doc ID in ascending order.\n    ///\n    /// A min-heap is a classic way of optimally computing a disjunction over\n    /// an arbitrary number of ordered streams.\n    queue: BinaryHeap<PostingIter<'i>>,\n    /// Whether this disjunction has been exhausted or not.\n    is_done: bool,\n}\n\nimpl<'i> Disjunction<'i> {\n    /// Create a new disjunction over the given posting iterators.\n    fn new(\n        index: &'i IndexReader,\n        query_len: usize,\n        scorer: NameScorer,\n        posting_iters: Vec<PostingIter<'i>>,\n    ) -> Disjunction<'i> {\n        let mut queue = BinaryHeap::new();\n        for postings in posting_iters {\n            queue.push(postings);\n        }\n        let is_done = queue.is_empty();\n        let query_len = query_len as f64;\n        Disjunction { index, query_len, scorer, queue, is_done }\n    }\n\n    /// Create an empty disjunction that never matches anything.\n    fn empty(index: &'i IndexReader, scorer: NameScorer) -> Disjunction<'i> {\n        Disjunction {\n            index,\n            query_len: 0.0,\n            scorer,\n            queue: BinaryHeap::new(),\n            is_done: true,\n        }\n    }\n\n    /// Skip this disjunction such that all posting iterators are either\n    /// positioned at the smallest doc ID greater than the given doc ID.\n    ///\n    /// If any posting iterator contains the given doc ID, then it is scored\n    /// and returned. 
The score incorporates all posting iterators that contain\n    /// the given doc ID.\n    fn skip_to(&mut self, target_docid: DocID) -> Option<Scored<DocID>> {\n        if self.is_done {\n            return None;\n        }\n        let mut found = false;\n        // loop invariant: loop until all posting iterators are either\n        // positioned directly at the target doc ID (in which case, `found`\n        // is set to `true`) or beyond the target doc ID. If none of the\n        // iterators contain the target doc ID, then `found` remains `false`.\n        loop {\n            // This unwrap is OK because we're only here if we have a\n            // non-empty queue.\n            let mut postings = self.queue.peek_mut().unwrap();\n            if postings.docid().map_or(true, |x| x >= target_docid) {\n                found = found || Some(target_docid) == postings.docid();\n                // This is the smallest posting iterator, which means all\n                // iterators are now either at or beyond target_docid.\n                break;\n            }\n            // Skip through this iterator until we're at or beyond the target\n            // doc ID.\n            while postings.docid().map_or(false, |x| x < target_docid) {\n                postings.next();\n            }\n            found = found || Some(target_docid) == postings.docid();\n        }\n        if !found {\n            return None;\n        }\n        // We're here if we found our target doc ID, which means at least one\n        // posting iterator is pointing to the doc ID and it is necessarily\n        // the minimum doc ID of all the posting iterators in this disjunction.\n        // Therefore, advance such that all posting iterators are beyond the\n        // target doc ID.\n        //\n        // (If we didn't find the target doc ID, then the loop invariant above\n        // guarantees that we are already past the target doc ID.)\n        self.next()\n    }\n}\n\nimpl<'i> Iterator 
for Disjunction<'i> {\n    type Item = Scored<DocID>;\n\n    fn next(&mut self) -> Option<Scored<DocID>> {\n        if self.is_done {\n            return None;\n        }\n        // Find our next matching ngram.\n        let mut scored1 = {\n            // This unwrap is OK because we're only here if we have a\n            // non-empty queue.\n            let mut postings = self.queue.peek_mut().unwrap();\n            match postings.score() {\n                None => {\n                    self.is_done = true;\n                    return None;\n                }\n                Some(scored) => {\n                    postings.next();\n                    scored\n                }\n            }\n        };\n        // Discover if any of the other posting iterators also match this\n        // ngram.\n        loop {\n            // This unwrap is OK because we're only here if we have a\n            // non-empty queue.\n            let mut postings = self.queue.peek_mut().unwrap();\n            match postings.score() {\n                None => break,\n                Some(scored2) => {\n                    // If the smallest posting iterator isn't equivalent to\n                    // the doc ID found above, then we've found all of the\n                    // matching terms for this doc ID that we'll find.\n                    if scored1.value() != scored2.value() {\n                        break;\n                    }\n                    scored1 = scored1.map_score(|s| s + scored2.score());\n                    postings.next();\n                }\n            }\n        }\n        // Some of our scorers are more convenient to compute at the\n        // disjunction level rather than at the term level.\n        if let NameScorer::Jaccard = self.scorer {\n            // When using Jaccard, the score returned by the posting\n            // iterator is always 1. Thus, `scored.score` represents the\n            // total number of terms that matched this document. 
In other\n            // words, it is the cardinality of the intersection of terms\n            // between the query and our candidate, `|A ∩ B|`.\n            //\n            // `query_len` represents the total number of terms in our query\n            // (not just the number of terms in this disjunction!), and\n            // `doc_len` represents the total number of terms in our candidate.\n            // Thus, since `|A u B| = |A| + |B| - |A ∩ B|`, we have that\n            // `|A u B| = query_len + doc_len - scored.score`. And finally, the\n            // Jaccard index is `|A ∩ B| / |A u B|`.\n            let doc_len = self.index.document_length(*scored1.value()) as f64;\n            let union = self.query_len + doc_len - scored1.score();\n            scored1 = scored1.map_score(|s| s / union);\n        } else if let NameScorer::QueryRatio = self.scorer {\n            // This is like Jaccard, but our score is computed purely as the\n            // ratio of query terms that matched this document.\n            scored1 = scored1.map_score(|s| s / self.query_len)\n        }\n        Some(scored1)\n    }\n}\n\n/// An iterator over a postings list for a specific ngram.\n///\n/// A postings list is a sequence of pairs, where each pair has a document\n/// ID and a frequency. The document ID indicates that the ngram is in the\n/// text indexed for that ID, and the frequency counts the number of times\n/// that ngram occurs in the document.\n///\n/// To save space, each pair is encoded using 32 bits. Frequencies are capped\n/// at a maximum of 15, which fit into the high 4 bits. The low 28 bits contain\n/// the doc ID.\n///\n/// The postings list starts with a single 32-bit little endian\n/// integer that represents the document frequency of the ngram. This in turn\n/// determines how many pairs to read. 
In other words, a posting list is a\n/// length prefixed array of 32 bit little endian integer values.\n///\n/// This type is intended to be used in a max-heap, and orients its Ord\n/// definition such that the heap becomes a min-heap. The ordering criteria\n/// is derived from only the docid.\n#[derive(Clone)]\nstruct PostingIter<'i> {\n    /// A handle to the underlying index.\n    index: &'i IndexReader,\n    /// The scoring function to use.\n    scorer: NameScorer,\n    /// The number of times the term for these postings appeared in the\n    /// original query. This increases the score proportionally.\n    count: f64,\n    /// The raw bytes of the posting list. The number of bytes is\n    /// exactly equivalent to `4 * document-frequency(ngram)`, where\n    /// `document-frequency(ngram)` is the total number of documents in which\n    /// `ngram` occurs.\n    ///\n    /// This does not include the length prefix.\n    postings: &'i [u8],\n    /// The document frequency of this term.\n    len: usize,\n    /// The current posting. This is `None` once this iterator is exhausted.\n    posting: Option<Posting>,\n    /// A docid used for sorting postings. When the iterator is exhausted,\n    /// this is greater than the maximum doc id. Otherwise, this is always\n    /// equivalent to posting.docid.\n    ///\n    /// We do this for efficiency by avoiding going through the optional\n    /// Posting.\n    docid: DocID,\n    /// The OkapiBM25 IDF score. This is invariant across all items in a\n    /// posting list, so we compute it once at construction. 
This saves a\n    /// call to `log` for every doc ID visited.\n    okapi_idf: f64,\n}\n\n/// A single entry in a posting list.\n#[derive(Clone, Copy, Debug)]\nstruct Posting {\n    /// The document id.\n    docid: DocID,\n    /// The frequency, i.e., the number of times the ngram occurred in the\n    /// document identified by the docid.\n    frequency: u32,\n}\n\nimpl Posting {\n    /// Read the next posting pair (doc ID and frequency) from the given\n    /// postings list. If the list is empty, then return `None`.\n    fn read(slice: &[u8]) -> Option<Posting> {\n        if slice.is_empty() {\n            None\n        } else {\n            let v = read_le_u32(slice);\n            Some(Posting { docid: v & MAX_DOC_ID, frequency: v >> 28 })\n        }\n    }\n}\n\nimpl<'i> PostingIter<'i> {\n    /// Create a new posting iterator for the given term in the given index.\n    /// Scores will be computed with the given scoring function.\n    ///\n    /// `count` should be the number of times this term occurred in the\n    /// original query string.\n    fn new(\n        index: &'i IndexReader,\n        scorer: NameScorer,\n        count: usize,\n        term: &str,\n    ) -> PostingIter<'i> {\n        let mut postings = &*index.postings;\n        let offset = match index.ngram.get(term.as_bytes()) {\n            Some(offset) => offset as usize,\n            None => {\n                // If the term isn't in the index, then return an exhausted\n                // iterator.\n                return PostingIter {\n                    index,\n                    scorer,\n                    count: 0.0,\n                    postings: &[],\n                    len: 0,\n                    posting: None,\n                    docid: MAX_DOC_ID + 1,\n                    okapi_idf: 0.0,\n                };\n            }\n        };\n        postings = &postings[offset..];\n        let len = read_le_u32(postings) as usize;\n        postings = &postings[4..];\n\n        let 
corpus_count = index.config.num_documents as f64;\n        let df = len as f64;\n        let okapi_idf = (1.0 + (corpus_count - df + 0.5) / (df + 0.5)).log2();\n        let mut it = PostingIter {\n            index,\n            scorer,\n            count: count as f64,\n            postings: &postings[..4 * len],\n            len,\n            posting: None,\n            docid: 0,\n            okapi_idf,\n        };\n        // Advance to the first posting.\n        it.next();\n        it\n    }\n\n    /// Return the current posting. If this iterator has been exhausted, then\n    /// this returns `None`.\n    fn posting(&self) -> Option<Posting> {\n        self.posting\n    }\n\n    /// Returns the document frequency for the term corresponding to these\n    /// postings.\n    fn len(&self) -> usize {\n        self.len\n    }\n\n    /// Return the current document ID. If this iterator has been exhausted,\n    /// then this returns `None`.\n    fn docid(&self) -> Option<DocID> {\n        self.posting().map(|p| p.docid)\n    }\n\n    /// Return the score with the current document ID. If this iterator has\n    /// been exhausted, then this returns `None`.\n    fn score(&self) -> Option<Scored<DocID>> {\n        match self.scorer {\n            NameScorer::OkapiBM25 => self.score_okapibm25(),\n            NameScorer::TFIDF => self.score_tfidf(),\n            NameScorer::Jaccard => self.score_jaccard(),\n            NameScorer::QueryRatio => self.score_query_ratio(),\n        }\n        .map(|scored| scored.map_score(|s| s * self.count))\n    }\n\n    /// Score the current doc ID using Okapi BM25. 
It's similarish to TF-IDF,\n    /// but uses a document length normalization term.\n    fn score_okapibm25(&self) -> Option<Scored<DocID>> {\n        let post = match self.posting() {\n            None => return None,\n            Some(post) => post,\n        };\n\n        let k1 = 1.2;\n        let b = 0.75;\n        let doc_len = self.index.document_length(post.docid);\n        let norm = (doc_len as f64) / self.index.config.avg_document_len;\n        let tf = post.frequency as f64;\n\n        let num = tf * (k1 + 1.0);\n        let den = tf + k1 * (1.0 - b + b * norm);\n        let score = (num / den) * self.okapi_idf;\n        let capped = if score < 0.0 { 0.0 } else { score };\n        Some(Scored::new(post.docid).with_score(capped))\n    }\n\n    /// Score the current doc ID using the traditional TF-IDF ranking function.\n    fn score_tfidf(&self) -> Option<Scored<DocID>> {\n        let post = match self.posting() {\n            None => return None,\n            Some(post) => post,\n        };\n\n        let corpus_docs = self.index.config.num_documents as f64;\n        let term_docs = self.len as f64;\n        let tf = post.frequency as f64;\n        let idf = (corpus_docs / (1.0 + term_docs)).log2();\n        let score = tf * idf;\n        Some(Scored::new(post.docid).with_score(score))\n    }\n\n    /// Score the current doc ID using the Jaccard index, which measures the\n    /// overlap between two sets.\n    ///\n    /// Note that this always returns `1.0`. The Jaccard index itself must be\n    /// computed by the disjunction scorer.\n    fn score_jaccard(&self) -> Option<Scored<DocID>> {\n        self.posting().map(|p| Scored::new(p.docid).with_score(1.0))\n    }\n\n    /// Score the current doc ID using the ratio of terms in the query that\n    /// matched the terms in this doc ID.\n    ///\n    /// Note that this always returns `1.0`. 
The query ratio itself must be\n    /// computed by the disjunction scorer.\n    fn score_query_ratio(&self) -> Option<Scored<DocID>> {\n        self.posting().map(|p| Scored::new(p.docid).with_score(1.0))\n    }\n}\n\nimpl<'i> Iterator for PostingIter<'i> {\n    type Item = Posting;\n\n    fn next(&mut self) -> Option<Posting> {\n        self.posting = match Posting::read(self.postings) {\n            None => {\n                self.docid = MAX_DOC_ID + 1;\n                None\n            }\n            Some(p) => {\n                self.postings = &self.postings[4..];\n                self.docid = p.docid;\n                Some(p)\n            }\n        };\n        self.posting\n    }\n}\n\nimpl<'i> Eq for PostingIter<'i> {}\n\nimpl<'i> PartialEq for PostingIter<'i> {\n    fn eq(&self, other: &PostingIter<'i>) -> bool {\n        self.docid == other.docid\n    }\n}\n\nimpl<'i> Ord for PostingIter<'i> {\n    fn cmp(&self, other: &PostingIter<'i>) -> cmp::Ordering {\n        // std::collections::BinaryHeap is a max-heap and we need a\n        // min-heap, so write this as-if it were a max-heap, then reverse it.\n        // Note that exhausted searchers should always have the lowest\n        // priority, and therefore, be considered maximal.\n        self.docid.cmp(&other.docid).reverse()\n    }\n}\n\nimpl<'i> PartialOrd for PostingIter<'i> {\n    fn partial_cmp(&self, other: &PostingIter<'i>) -> Option<cmp::Ordering> {\n        Some(self.cmp(other))\n    }\n}\n\n/// A writer for indexing names to disk.\n///\n/// A writer opens and writes to several files simultaneously, which keeps the\n/// implementation simple.\n///\n/// The index writer cannot stream the postings or term index, since the term\n/// index requires its ngrams to be inserted in sorted order. 
Postings lists\n/// are written as length prefixed sequences, so we need to know the lengths\n/// of all our postings lists before writing them.\npub struct IndexWriter {\n    /// A builder for the ngram term index.\n    ///\n    /// This isn't used until the caller indicates that it is done indexing\n    /// names. At which point, we insert all ngrams into the FST in sorted\n    /// order. Each ngram is mapped to the beginning of its correspond\n    /// postings list.\n    ngram: fst::MapBuilder<io::BufWriter<File>>,\n    /// The type of ngram extraction to use.\n    ngram_type: NgramType,\n    /// The size of ngrams to generate.\n    ngram_size: usize,\n    /// A writer for postings lists.\n    ///\n    /// This isn't written to until the caller indicates that it is done\n    /// indexing names. At which point, every posting list is written as a\n    /// length prefixed array, in the same order that terms are written to the\n    /// term index.\n    postings: CursorWriter<io::BufWriter<File>>,\n    /// A map from document ID to name ID. This is written to in a streaming\n    /// fashion during indexing. The ID map consists of N 64-bit little\n    /// endian integers, where N is the total number of names indexed.\n    ///\n    /// The document ID (the position in this map) is a unique internal\n    /// identifier assigned to each name, while the name ID is an identifier\n    /// provided by the caller. Multiple document IDs may map to the same\n    /// name ID (e.g., for indexing alternate names).\n    idmap: CursorWriter<io::BufWriter<File>>,\n    /// A map from document ID to document length, where the length corresponds\n    /// to the number of ngrams in the document. The map consists of N 16-bit\n    /// little endian integers, where N is the total number of names indexed.\n    ///\n    /// The document lengths are used at query time as normalization\n    /// parameters. 
They are written in a streaming fashion during the indexing\n    /// process.\n    norms: CursorWriter<io::BufWriter<File>>,\n    /// A JSON formatted configuration file that includes some aggregate\n    /// statistics (such as the average document length, in ngrams) and the\n    /// ngram configuration. The ngram configuration in particular is used at\n    /// query time to make sure that query-time uses the same analysis as\n    /// index-time.\n    ///\n    /// This is written at the end of the indexing process.\n    config: CursorWriter<io::BufWriter<File>>,\n    /// An in-memory map from ngram to its corresponding postings list. Once\n    /// indexing is done, this is written to disk via the FST term index and\n    /// postings list writers documented above.\n    terms: FnvHashMap<String, Postings>,\n    /// The next document ID, starting at 0. Each name added gets assigned its\n    /// own unique document ID. Queries read document IDs from the postings\n    /// list, but are mapped back to name IDs using the `idmap` before being\n    /// returned to the caller.\n    next_docid: DocID,\n    /// The average document length, in ngrams, for every name indexed. This is\n    /// used along with document lengths to compute normalization terms for\n    /// scoring at query time.\n    avg_document_len: f64,\n}\n\n/// A single postings list.\n#[derive(Clone, Debug, Default)]\nstruct Postings {\n    /// A sorted list of postings, in order of ascending document IDs.\n    list: Vec<Posting>,\n}\n\nimpl IndexWriter {\n    /// Open an index for writing to the given directory. 
Any previous name\n    /// index in the given directory is overwritten.\n    ///\n    /// The given ngram configuration is used to transform all indexed names\n    /// into terms for the inverted index.\n    pub fn open<P: AsRef<Path>>(\n        dir: P,\n        ngram_type: NgramType,\n        ngram_size: usize,\n    ) -> Result<IndexWriter> {\n        let dir = dir.as_ref();\n\n        let ngram = fst_map_builder_file(dir.join(NGRAM))?;\n        let postings = CursorWriter::from_path(dir.join(POSTINGS))?;\n        let idmap = CursorWriter::from_path(dir.join(IDMAP))?;\n        let norms = CursorWriter::from_path(dir.join(NORMS))?;\n        let config = CursorWriter::from_path(dir.join(CONFIG))?;\n        Ok(IndexWriter {\n            ngram,\n            ngram_type,\n            ngram_size,\n            postings,\n            idmap,\n            norms,\n            config,\n            terms: FnvHashMap::default(),\n            next_docid: 0,\n            avg_document_len: 0.0,\n        })\n    }\n\n    /// Finish writing names and serialize the index to disk.\n    pub fn finish(mut self) -> Result<()> {\n        let num_docs = self.num_docs();\n        let mut ngram_to_postings: Vec<(String, Postings)> =\n            self.terms.into_iter().collect();\n        // We could use a BTreeMap and get out our keys in sorted order, but\n        // the overhead of inserting into the BTreeMap dwarfs the savings we\n        // get from pre-sorted keys.\n        ngram_to_postings.sort_by(|&(ref t1, _), &(ref t2, _)| t1.cmp(t2));\n\n        for (term, postings) in ngram_to_postings {\n            let pos = self.postings.position() as u64;\n            self.ngram.insert(term.as_bytes(), pos).map_err(Error::fst)?;\n            self.postings\n                .write_u32(postings.list.len() as u32)\n                .map_err(Error::io)?;\n            for posting in postings.list {\n                let freq = cmp::min(15, posting.frequency);\n                let v = (freq << 28) | 
posting.docid;\n                self.postings.write_u32(v).map_err(Error::io)?;\n            }\n        }\n\n        serde_json::to_writer_pretty(\n            &mut self.config,\n            &Config {\n                ngram_type: self.ngram_type,\n                ngram_size: self.ngram_size,\n                avg_document_len: self.avg_document_len,\n                num_documents: num_docs as u64,\n            },\n        )\n        .map_err(|e| Error::config(e.to_string()))?;\n        self.ngram.finish().map_err(Error::fst)?;\n        self.idmap.flush().map_err(Error::io)?;\n        self.postings.flush().map_err(Error::io)?;\n        self.norms.flush().map_err(Error::io)?;\n        self.config.flush().map_err(Error::io)?;\n        Ok(())\n    }\n\n    /// Inserts the given name to this index, and associates it with the\n    /// provided `NameID`. Multiple names may be associated with the same\n    /// `NameID`.\n    pub fn insert(&mut self, name_id: NameID, name: &str) -> Result<()> {\n        let docid = self.next_docid(name_id)?;\n        let name = normalize_query(name);\n        let mut count = 0u16; // document length in number of ngrams\n        self.ngram_type.clone().iter(self.ngram_size, &name, |ngram| {\n            self.insert_term(docid, ngram);\n            // If a document length exceeds 2^16, then it is far too long for\n            // a name anyway, so we cap it at 2^16.\n            count = count.saturating_add(1);\n        });\n        // Update our mean document length (in ngrams).\n        self.avg_document_len +=\n            (count as f64 - self.avg_document_len) / (self.num_docs() as f64);\n        // Write the document length to disk, which is used as a normalization\n        // term for some scorers (like Okapi-BM25).\n        self.norms.write_u16(count).map_err(Error::io)?;\n        Ok(())\n    }\n\n    /// Add a single term that is part of a name identified by the given docid.\n    /// This updates the postings for this term, or creates a 
new posting if\n    /// this is the first time this term has been seen.\n    fn insert_term(&mut self, docid: DocID, term: &str) {\n        if let Some(posts) = self.terms.get_mut(term) {\n            posts.posting(docid).frequency += 1;\n            return;\n        }\n        let mut list = Postings::default();\n        list.posting(docid).frequency = 1;\n        self.terms.insert(term.to_string(), list);\n    }\n\n    /// Retrieve a fresh doc id, and associate it with the given name id.\n    fn next_docid(&mut self, name_id: NameID) -> Result<DocID> {\n        let docid = self.next_docid;\n        self.idmap.write_u64(name_id).map_err(Error::io)?;\n        self.next_docid = match self.next_docid.checked_add(1) {\n            None => bug!(\"exhausted doc ids\"),\n            Some(next_docid) => next_docid,\n        };\n        if self.next_docid > MAX_DOC_ID {\n            let max = MAX_DOC_ID + 1; // docids are 0-indexed\n            bug!(\"exceeded maximum number of names ({})\", max);\n        }\n        Ok(docid)\n    }\n\n    /// Return the total number of documents have been assigned doc ids.\n    fn num_docs(&self) -> u32 {\n        self.next_docid\n    }\n}\n\nimpl Postings {\n    /// Return a mutable reference to the posting for the given docid. If one\n    /// doesn't exist, then create one (with a zero frequency) and return it.\n    fn posting(&mut self, docid: DocID) -> &mut Posting {\n        if self.list.last().map_or(true, |x| x.docid != docid) {\n            self.list.push(Posting { docid, frequency: 0 });\n        }\n        // This unwrap is OK because if the list was empty when this method was\n        // called, then we added an element above, and is thus now non-empty.\n        self.list.last_mut().unwrap()\n    }\n}\n\n/// The type of scorer that the name index should use.\n///\n/// The default is OkapiBM25. 
If you aren't sure which scorer to use, then\n/// stick with the default.\n#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]\npub enum NameScorer {\n    /// OkapiBM25 is a TF-IDF-like ranking function, which takes name length\n    /// into account.\n    OkapiBM25,\n    /// TFIDF is the traditional TF-IDF ranking function, which does not\n    /// incorporate document length.\n    TFIDF,\n    /// Jaccard is a ranking function determined by computing the similarity\n    /// of ngrams between the query and a name in the index. The similarity\n    /// is computed by dividing the number of ngrams in common by the total\n    /// number of distinct ngrams in both the query and the name combined.\n    Jaccard,\n    /// QueryRatio is a ranking function that represents the ratio of query\n    /// terms that matched a name. It is computed by dividing the number of\n    /// ngrams in common by the total number of ngrams in the query only.\n    QueryRatio,\n}\n\nimpl NameScorer {\n    /// Returns a list of strings representing the possible scorer values.\n    pub fn possible_names() -> &'static [&'static str] {\n        &[\"okapibm25\", \"tfidf\", \"jaccard\", \"queryratio\"]\n    }\n\n    /// Return a string representation of this scorer.\n    ///\n    /// The string returned can be parsed back into a `NameScorer`.\n    pub fn as_str(&self) -> &'static str {\n        match *self {\n            NameScorer::OkapiBM25 => \"okapibm25\",\n            NameScorer::TFIDF => \"tfidf\",\n            NameScorer::Jaccard => \"jaccard\",\n            NameScorer::QueryRatio => \"queryratio\",\n        }\n    }\n}\n\nimpl Default for NameScorer {\n    fn default() -> NameScorer {\n        NameScorer::OkapiBM25\n    }\n}\n\nimpl fmt::Display for NameScorer {\n    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {\n        write!(f, \"{}\", self.as_str())\n    }\n}\n\nimpl FromStr for NameScorer {\n    type Err = Error;\n\n    fn from_str(s: &str) -> Result<NameScorer> {\n        match s 
{\n            \"okapibm25\" => Ok(NameScorer::OkapiBM25),\n            \"tfidf\" => Ok(NameScorer::TFIDF),\n            \"jaccard\" => Ok(NameScorer::Jaccard),\n            \"queryratio\" => Ok(NameScorer::QueryRatio),\n            unk => Err(Error::unknown_scorer(unk)),\n        }\n    }\n}\n\n/// The style of ngram extraction to use.\n///\n/// The same style of ngram extraction is always used at index time and at\n/// query time.\n///\n/// Each ngram type uses the ngram size configuration differently.\n///\n/// All ngram styles used Unicode codepoints as the definition of a character.\n/// For example, a 3-gram might contain up to 4 bytes, if it contains 3 Unicode\n/// codepoints that each require 4 UTF-8 code units.\n#[derive(Clone, Copy, Debug, Deserialize, Eq, Hash, PartialEq, Serialize)]\npub enum NgramType {\n    /// A windowing ngram.\n    ///\n    /// This is the tradition style of ngram, where sliding window of size\n    /// `N` is moved across the entire content to be index. For example, the\n    /// 3-grams for the string `homer` are hom, ome and mer.\n    #[serde(rename = \"window\")]\n    Window,\n    /// An edge ngram.\n    ///\n    /// This style of ngram produces ever longer ngrams, where each ngram is\n    /// anchored to the start of a word. Words are determined simply by\n    /// splitting whitespace.\n    ///\n    /// For example, the edge ngrams of `homer simpson`, where the max ngram\n    /// size is 5, would be: hom, home, homer, sim, simp, simps. 
Generally,\n    /// for this ngram type, one wants to use a large maximum ngram size.\n    /// Perhaps somewhere close to the maximum number of ngrams in any word\n    /// in the corpus.\n    ///\n    /// Note that there is no way to set the minimum ngram size (which is 3).\n    #[serde(rename = \"edge\")]\n    Edge,\n}\n\n/// The minimum size of an ngram emitted by the edge ngram iterator.\nconst MIN_EDGE_NGRAM_SIZE: usize = 3;\n\nimpl NgramType {\n    /// Return all possible ngram types.\n    pub fn possible_names() -> &'static [&'static str] {\n        &[\"window\", \"edge\"]\n    }\n\n    /// Return a string representation of this type.\n    pub fn as_str(&self) -> &'static str {\n        match *self {\n            NgramType::Window => \"window\",\n            NgramType::Edge => \"edge\",\n        }\n    }\n\n    /// Execute the given function over each ngram in the text provided using\n    /// the given size configuration.\n    ///\n    /// We don't use normal Rust iterators here because an internal iterator\n    /// is much easier to implement.\n    fn iter<'t, F: FnMut(&'t str)>(&self, size: usize, text: &'t str, f: F) {\n        match *self {\n            NgramType::Window => NgramType::iter_window(size, text, f),\n            NgramType::Edge => NgramType::iter_edge(size, text, f),\n        }\n    }\n\n    fn iter_window<'t, F: FnMut(&'t str)>(\n        size: usize,\n        text: &'t str,\n        mut f: F,\n    ) {\n        if size == 0 {\n            return;\n        }\n        let end_skip = text.chars().take(size).count().saturating_sub(1);\n        let start = text.char_indices();\n        let end = text.char_indices().skip(end_skip);\n        for ((s, _), (e, c)) in start.zip(end) {\n            f(&text[s..e + c.len_utf8()]);\n        }\n    }\n\n    fn iter_edge<'t, F: FnMut(&'t str)>(\n        max_size: usize,\n        text: &'t str,\n        mut f: F,\n    ) {\n        if max_size == 0 {\n            return;\n        }\n        for word in 
text.split_whitespace() {\n            let end_skip = word\n                .chars()\n                .take(MIN_EDGE_NGRAM_SIZE)\n                .count()\n                .saturating_sub(1);\n            let mut size = end_skip + 1;\n            for (end, c) in word.char_indices().skip(end_skip) {\n                f(&word[..end + c.len_utf8()]);\n                size += 1;\n                if size > max_size {\n                    break;\n                }\n            }\n        }\n    }\n}\n\nimpl Default for NgramType {\n    fn default() -> NgramType {\n        NgramType::Window\n    }\n}\n\nimpl fmt::Display for NgramType {\n    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {\n        write!(f, \"{}\", self.as_str())\n    }\n}\n\nimpl FromStr for NgramType {\n    type Err = Error;\n\n    fn from_str(s: &str) -> Result<NgramType> {\n        match s {\n            \"window\" => Ok(NgramType::Window),\n            \"edge\" => Ok(NgramType::Edge),\n            unk => Err(Error::unknown_ngram_type(unk)),\n        }\n    }\n}\n\nfn normalize_query(s: &str) -> String {\n    // We might consider doing Unicode normalization here, but it probably\n    // doesn't matter too much on a predominantly ASCII data set.\n    s.to_lowercase()\n}\n\nfn read_le_u32(slice: &[u8]) -> u32 {\n    u32::from_le_bytes(slice[..4].try_into().unwrap())\n}\n\n#[cfg(test)]\nmod tests {\n    use super::*;\n    use crate::index::tests::TestContext;\n\n    // Test the actual name index.\n\n    /// Creates a name index, where each name provided is assigned its own\n    /// unique ID, starting at 0.\n    fn create_index(index_dir: &Path, names: &[&str]) -> IndexReader {\n        let mut wtr =\n            IndexWriter::open(index_dir, NgramType::Window, 3).unwrap();\n        for (i, name) in names.iter().enumerate() {\n            wtr.insert(i as u64, name).unwrap();\n        }\n        wtr.finish().unwrap();\n\n        IndexReader::open(index_dir).unwrap()\n    }\n\n    /// Build a name 
query, and disable the dynamic stop word detection.\n    ///\n    /// It would be nice to test the stop word detection, but it makes writing\n    /// unit tests very difficult unfortunately.\n    fn name_query(name: &str) -> NameQuery {\n        NameQuery::new(name).with_stop_word_ratio(0.0)\n    }\n\n    fn ids(results: &[Scored<NameID>]) -> Vec<NameID> {\n        let mut ids: Vec<_> = results.iter().map(|r| *r.value()).collect();\n        ids.sort();\n        ids\n    }\n\n    /// Some names involving bruce.\n    const BRUCES: &'static [&'static str] = &[\n        \"Bruce Springsteen\", // 0\n        \"Bruce Kulick\",      // 1\n        \"Bruce Arians\",      // 2\n        \"Bruce Smith\",       // 3\n        \"Bruce Willis\",      // 4\n        \"Bruce Wayne\",       // 5\n        \"Bruce Banner\",      // 6\n    ];\n\n    #[test]\n    fn names_bruces_1() {\n        let ctx = TestContext::new(\"small\");\n        let idx = create_index(ctx.index_dir(), BRUCES);\n        let query = name_query(\"bruce\");\n        let results = idx.search(&query).into_vec();\n\n        // This query matches everything.\n        assert_eq!(results.len(), 7);\n        // The top two hits are the shortest documents, because of Okapi-BM25's\n        // length normalization.\n        assert_eq!(results[0].score(), 1.0);\n        assert_eq!(results[1].score(), 1.0);\n        assert_eq!(ids(&results[0..2]), vec![3, 5]);\n    }\n\n    #[test]\n    fn names_bruces_2() {\n        let ctx = TestContext::new(\"small\");\n        let idx = create_index(ctx.index_dir(), BRUCES);\n        let query = name_query(\"e w\");\n        let results = idx.search(&query).into_vec();\n\n        // The 'e w' ngram is only in two documents: Bruce Willis and\n        // Bruce Wayne. 
Since Wayne is shorter than Willis, it should always\n        // be first.\n        assert_eq!(results.len(), 2);\n        assert_eq!(*results[0].value(), 5);\n        assert_eq!(*results[1].value(), 4);\n    }\n\n    #[test]\n    fn names_bruces_3() {\n        let ctx = TestContext::new(\"small\");\n        let idx = create_index(ctx.index_dir(), BRUCES);\n        let query = name_query(\"Springsteen\");\n        let results = idx.search(&query).into_vec();\n\n        assert_eq!(results.len(), 1);\n        assert_eq!(*results[0].value(), 0);\n    }\n\n    #[test]\n    fn names_bruces_4() {\n        let ctx = TestContext::new(\"small\");\n        let idx = create_index(ctx.index_dir(), BRUCES);\n        let query =\n            name_query(\"Springsteen Kulick Arians Smith Willis Wayne Banner\");\n        let results = idx.search(&query).into_vec();\n\n        // This query should hit everything.\n        assert_eq!(results.len(), 7);\n    }\n\n    // Test our various ngram strategies.\n\n    fn ngrams_window(n: usize, text: &str) -> Vec<&str> {\n        let mut grams = vec![];\n        NgramType::Window.iter(n, text, |gram| grams.push(gram));\n        grams\n    }\n\n    fn ngrams_edge(n: usize, text: &str) -> Vec<&str> {\n        let mut grams = vec![];\n        NgramType::Edge.iter(n, text, |gram| grams.push(gram));\n        grams\n    }\n\n    #[test]\n    #[should_panic]\n    fn ngrams_window_zero_banned() {\n        assert_eq!(ngrams_window(0, \"abc\"), vec![\"abc\"]);\n    }\n\n    #[test]\n    fn ngrams_window_weird_sizes() {\n        assert_eq!(\n            ngrams_window(2, \"abcdef\"),\n            vec![\"ab\", \"bc\", \"cd\", \"de\", \"ef\",]\n        );\n        assert_eq!(\n            ngrams_window(1, \"abcdef\"),\n            vec![\"a\", \"b\", \"c\", \"d\", \"e\", \"f\",]\n        );\n        assert_eq!(ngrams_window(2, \"ab\"), vec![\"ab\",]);\n        assert_eq!(ngrams_window(1, \"ab\"), vec![\"a\", \"b\",]);\n        assert_eq!(ngrams_window(1, 
\"a\"), vec![\"a\",]);\n        assert_eq!(ngrams_window(1, \"\"), Vec::<&str>::new());\n    }\n\n    #[test]\n    fn ngrams_window_ascii() {\n        assert_eq!(\n            ngrams_window(3, \"abcdef\"),\n            vec![\"abc\", \"bcd\", \"cde\", \"def\",]\n        );\n        assert_eq!(ngrams_window(3, \"abcde\"), vec![\"abc\", \"bcd\", \"cde\",]);\n        assert_eq!(ngrams_window(3, \"abcd\"), vec![\"abc\", \"bcd\",]);\n        assert_eq!(ngrams_window(3, \"abc\"), vec![\"abc\",]);\n        assert_eq!(ngrams_window(3, \"ab\"), vec![\"ab\",]);\n        assert_eq!(ngrams_window(3, \"a\"), vec![\"a\",]);\n        assert_eq!(ngrams_window(3, \"\"), Vec::<&str>::new());\n    }\n\n    #[test]\n    fn ngrams_window_non_ascii() {\n        assert_eq!(\n            ngrams_window(3, \"αβγφδε\"),\n            vec![\"αβγ\", \"βγφ\", \"γφδ\", \"φδε\",]\n        );\n        assert_eq!(ngrams_window(3, \"αβγφδ\"), vec![\"αβγ\", \"βγφ\", \"γφδ\",]);\n        assert_eq!(ngrams_window(3, \"αβγφ\"), vec![\"αβγ\", \"βγφ\",]);\n        assert_eq!(ngrams_window(3, \"αβγ\"), vec![\"αβγ\",]);\n        assert_eq!(ngrams_window(3, \"αβ\"), vec![\"αβ\",]);\n        assert_eq!(ngrams_window(3, \"α\"), vec![\"α\",]);\n    }\n\n    #[test]\n    fn ngrams_edge_ascii() {\n        assert_eq!(\n            ngrams_edge(5, \"homer simpson\"),\n            vec![\"hom\", \"home\", \"homer\", \"sim\", \"simp\", \"simps\",]\n        );\n        assert_eq!(ngrams_edge(5, \"h\"), vec![\"h\",]);\n        assert_eq!(ngrams_edge(5, \"ho\"), vec![\"ho\",]);\n        assert_eq!(ngrams_edge(5, \"hom\"), vec![\"hom\",]);\n        assert_eq!(ngrams_edge(5, \"home\"), vec![\"hom\", \"home\",]);\n    }\n\n    #[test]\n    fn ngrams_edge_non_ascii() {\n        assert_eq!(\n            ngrams_edge(5, \"δεαβγφδε δε\"),\n            vec![\"δεα\", \"δεαβ\", \"δεαβγ\", \"δε\",]\n        );\n    }\n}\n"
  },
  {
    "path": "imdb-index/src/index/rating.rs",
    "content": "use std::path::Path;\n\nuse fst::{IntoStreamer, Streamer};\nuse memmap::Mmap;\n\nuse crate::error::{Error, Result};\nuse crate::record::Rating;\nuse crate::util::{\n    csv_file, fst_set_builder_file, fst_set_file, IMDB_RATINGS,\n};\n\n/// The name of the ratings index file.\n///\n/// The ratings index maps IMDb title ID to their average rating and number of\n/// votes. The index is itself an FST set, where all keys begin with the IMDb\n/// title ID, and also contain the average rating and number votes. Thus, a\n/// lookup is accomplished via a range query on the title ID without needing\n/// to consult the original CSV data.\nconst RATINGS: &str = \"ratings.fst\";\n\n/// An index for ratings, which supports looking up ratings/votes for IMDb\n/// titles efficiently.\n#[derive(Debug)]\npub struct Index {\n    idx: fst::Set<Mmap>,\n}\n\nimpl Index {\n    /// Open a rating index from the given index directory.\n    pub fn open<P: AsRef<Path>>(index_dir: P) -> Result<Index> {\n        Ok(Index {\n            // We claim it is safe to open the following memory map because we\n            // don't mutate them and no other process (should) either.\n            idx: unsafe { fst_set_file(index_dir.as_ref().join(RATINGS))? },\n        })\n    }\n\n    /// Create a rating index from the given IMDb data directory, and write it\n    /// to the given index directory. 
If a rating index already exists, then it\n    /// is overwritten.\n    pub fn create<P1: AsRef<Path>, P2: AsRef<Path>>(\n        data_dir: P1,\n        index_dir: P2,\n    ) -> Result<Index> {\n        let data_dir = data_dir.as_ref();\n        let index_dir = index_dir.as_ref();\n\n        let mut buf = vec![];\n        let mut count = 0u64;\n        let mut idx = fst_set_builder_file(index_dir.join(RATINGS))?;\n        let mut rdr = csv_file(data_dir.join(IMDB_RATINGS))?;\n        for result in rdr.deserialize() {\n            let record: Rating = result.map_err(Error::csv)?;\n\n            buf.clear();\n            write_rating(&record, &mut buf)?;\n            idx.insert(&buf).map_err(Error::fst)?;\n            count += 1;\n        }\n        idx.finish().map_err(Error::fst)?;\n\n        log::info!(\"{} ratings indexed\", count);\n        Index::open(index_dir)\n    }\n\n    /// Return the rating information (which includes the actual rating and\n    /// the number of votes associated with that rating) for the given IMDb\n    /// identifier. 
If no rating information exists for the given ID, then\n    /// `None` is returned.\n    pub fn rating(&self, id: &[u8]) -> Result<Option<Rating>> {\n        let mut upper = id.to_vec();\n        upper.push(0xFF);\n\n        let mut stream = self.idx.range().ge(id).le(upper).into_stream();\n        while let Some(rating_bytes) = stream.next() {\n            return Ok(Some(read_rating(rating_bytes)?));\n        }\n        Ok(None)\n    }\n}\n\nfn read_rating(bytes: &[u8]) -> Result<Rating> {\n    let nul = match bytes.iter().position(|&b| b == 0) {\n        Some(nul) => nul,\n        None => bug!(\"could not find nul byte\"),\n    };\n    let id = match String::from_utf8(bytes[..nul].to_vec()) {\n        Err(err) => bug!(\"rating id invalid UTF-8: {}\", err),\n        Ok(tvshow_id) => tvshow_id,\n    };\n\n    let i = nul + 1;\n    Ok(Rating {\n        id,\n        rating: read_rating_value(&bytes[i..])?,\n        votes: read_votes_value(&bytes[i + 4..])?,\n    })\n}\n\nfn write_rating(rat: &Rating, buf: &mut Vec<u8>) -> Result<()> {\n    if rat.id.as_bytes().iter().any(|&b| b == 0) {\n        bug!(\"unsupported rating id (with NUL byte) for {:?}\", rat);\n    }\n\n    buf.extend_from_slice(rat.id.as_bytes());\n    buf.push(0x00);\n    write_rating_value(rat.rating, buf);\n    write_votes_value(rat.votes, buf);\n    Ok(())\n}\n\nfn read_votes_value(slice: &[u8]) -> Result<u32> {\n    if slice.len() < 4 {\n        bug!(\"not enough bytes to read votes value\");\n    }\n    Ok(u32::from_be_bytes(slice[..4].try_into().unwrap()))\n}\n\nfn write_votes_value(votes: u32, buf: &mut Vec<u8>) {\n    buf.extend_from_slice(&votes.to_be_bytes())\n}\n\nfn read_rating_value(slice: &[u8]) -> Result<f32> {\n    if slice.len() < 4 {\n        bug!(\"not enough bytes to read rating value\");\n    }\n    Ok(f32::from_be_bytes(slice[..4].try_into().unwrap()))\n}\n\nfn write_rating_value(rating: f32, buf: &mut Vec<u8>) {\n    
buf.extend_from_slice(&rating.to_be_bytes())\n}\n\n#[cfg(test)]\nmod tests {\n    use super::Index;\n    use crate::index::tests::TestContext;\n\n    #[test]\n    fn basics() {\n        let ctx = TestContext::new(\"small\");\n        let idx = Index::create(ctx.data_dir(), ctx.index_dir()).unwrap();\n\n        let rat = idx.rating(b\"tt0000001\").unwrap().unwrap();\n        assert_eq!(rat.rating, 5.8);\n        assert_eq!(rat.votes, 1356);\n\n        assert!(idx.rating(b\"tt9999999\").unwrap().is_none());\n    }\n}\n"
  },
  {
    "path": "imdb-index/src/index/tests.rs",
    "content": "use std::path::{Path, PathBuf};\n\n/// Create an error from a format!-like syntax.\n#[macro_export]\nmacro_rules! err {\n    ($($tt:tt)*) => {\n        Box::<dyn std::error::Error>::from(format!($($tt)*))\n    }\n}\n\n/// A convenient result type alias.\npub type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;\n\n/// A simple test context that makes it convenient to create an index.\n///\n/// Each test context has an IMDb data directory (which usually has only a\n/// subset of the actual data) and an index directory (which starts empty by\n/// default).\n#[derive(Debug)]\npub struct TestContext {\n    _tmpdir: TempDir,\n    data_dir: PathBuf,\n    index_dir: PathBuf,\n}\n\nimpl TestContext {\n    /// Create a new test context using the test data set name given.\n    ///\n    /// Test data sets can be found in the `data/test` directory in this\n    /// repository's root. Data set names are the names of sub-directories of\n    /// `data`.\n    pub fn new(name: &str) -> TestContext {\n        let tmpdir = TempDir::new(\"imdb-rename-test-index\").unwrap();\n        let data_dir = PathBuf::from(\"../data/test\").join(name);\n        let index_dir = tmpdir.path().to_path_buf();\n        TestContext { _tmpdir: tmpdir, data_dir, index_dir }\n    }\n\n    /// Return the path to the data directory for this context.\n    pub fn data_dir(&self) -> &Path {\n        &self.data_dir\n    }\n\n    /// Return the path to the index directory for this context.\n    pub fn index_dir(&self) -> &Path {\n        &self.index_dir\n    }\n}\n\n/// A simple wrapper for creating a temporary directory that is automatically\n/// deleted when it's dropped.\n///\n/// We use this in lieu of tempfile because tempfile brings in too many\n/// dependencies.\n#[derive(Debug)]\npub struct TempDir(PathBuf);\n\nimpl Drop for TempDir {\n    fn drop(&mut self) {\n        std::fs::remove_dir_all(&self.0).unwrap();\n    }\n}\n\nimpl TempDir {\n    /// Create a new empty 
temporary directory under the system's configured\n    /// temporary directory.\n    pub fn new(prefix: &str) -> Result<TempDir> {\n        use std::sync::atomic::{AtomicUsize, Ordering};\n\n        static TRIES: usize = 100;\n        static COUNTER: AtomicUsize = AtomicUsize::new(0);\n\n        let tmpdir = std::env::temp_dir();\n        for _ in 0..TRIES {\n            let count = COUNTER.fetch_add(1, Ordering::SeqCst);\n            let path = tmpdir.join(prefix).join(count.to_string());\n            if path.is_dir() {\n                continue;\n            }\n            std::fs::create_dir_all(&path).map_err(|e| {\n                err!(\"failed to create {}: {}\", path.display(), e)\n            })?;\n            return Ok(TempDir(path));\n        }\n        Err(err!(\"failed to create temp dir after {} tries\", TRIES))\n    }\n\n    /// Return the underlying path to this temporary directory.\n    pub fn path(&self) -> &Path {\n        &self.0\n    }\n}\n"
  },
  {
    "path": "imdb-index/src/index/writer.rs",
    "content": "use std::fs::File;\nuse std::io::{self, Write};\nuse std::path::Path;\n\nuse crate::error::Result;\nuse crate::util::create_file;\n\n/// Wraps any writer and records the current position in the writer.\n///\n/// The position recorded always corresponds to the position that the next\n/// byte would be written to.\n#[derive(Clone, Debug)]\npub struct CursorWriter<W> {\n    wtr: W,\n    pos: usize,\n}\n\nimpl CursorWriter<io::BufWriter<File>> {\n    /// Create a new cursor writer that will write to a file at the given path.\n    /// The file is truncated before writing.\n    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {\n        let file = create_file(path)?;\n        Ok(CursorWriter::new(io::BufWriter::new(file)))\n    }\n}\n\nimpl<W: io::Write> CursorWriter<W> {\n    /// Wrap the given writer with a counter.\n    pub fn new(wtr: W) -> CursorWriter<W> {\n        CursorWriter { wtr, pos: 0 }\n    }\n\n    /// Return the current position of this writer.\n    pub fn position(&self) -> usize {\n        self.pos\n    }\n\n    /// Write a u16LE.\n    pub fn write_u16(&mut self, n: u16) -> io::Result<()> {\n        self.write_all(&n.to_le_bytes())\n    }\n\n    /// Write a u32LE.\n    pub fn write_u32(&mut self, n: u32) -> io::Result<()> {\n        self.write_all(&n.to_le_bytes())\n    }\n\n    /// Write a u64LE.\n    pub fn write_u64(&mut self, n: u64) -> io::Result<()> {\n        self.write_all(&n.to_le_bytes())\n    }\n}\n\nimpl<W: io::Write> io::Write for CursorWriter<W> {\n    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {\n        let n = self.wtr.write(buf)?;\n        self.pos += n;\n        Ok(n)\n    }\n\n    fn flush(&mut self) -> io::Result<()> {\n        self.wtr.flush()\n    }\n}\n"
  },
  {
    "path": "imdb-index/src/lib.rs",
    "content": "/*!\nThis crate provides an on-disk indexing data structure for searching IMDb.\nSearching is primarily done using information retrieval techniques, which\nsupport fuzzy name queries and using TF-IDF-like ranking functions.\n*/\n\n#![deny(missing_docs)]\n\npub use crate::error::{Error, ErrorKind, Result};\npub use crate::index::{\n    AKARecordIter, Index, IndexBuilder, MediaEntity, NameQuery, NameScorer,\n    NgramType,\n};\npub use crate::record::{Episode, Rating, Title, TitleKind, AKA};\npub use crate::scored::{Scored, SearchResults};\npub use crate::search::{Query, Searcher, Similarity};\n\n// A macro that creates an error that represents a bug.\n//\n// This is typically used when reading index structures from disk. Since the\n// data on disk is generally outside our control, we return an error using this\n// macro instead of panicking (or worse, silently misinterpreting data).\nmacro_rules! bug {\n    ($($tt:tt)*) => {{\n        return Err($crate::error::Error::bug(format!($($tt)*)));\n    }}\n}\n\nmod error;\nmod index;\nmod record;\nmod scored;\nmod search;\nmod util;\n"
  },
  {
    "path": "imdb-index/src/record.rs",
    "content": "use std::cmp;\nuse std::fmt;\nuse std::str::FromStr;\n\nuse serde::{Deserialize, Deserializer, Serialize};\n\nuse crate::error::Error;\n\n/// An IMDb title record.\n///\n/// This is the primary type of an IMDb media entry. This record defines the\n/// identifier of an IMDb title, which serves as a foreign key in other data\n/// files (such as alternate names, episodes and ratings).\n#[derive(Clone, Debug, Deserialize)]\npub struct Title {\n    /// An IMDb identifier.\n    ///\n    /// Generally, this is a fixed width string beginning with the characters\n    /// `tt`.\n    #[serde(rename = \"tconst\")]\n    pub id: String,\n    /// The specific type of a title, e.g., movie, TV show, episode, etc.\n    #[serde(rename = \"titleType\")]\n    pub kind: TitleKind,\n    /// The primary name of this title.\n    #[serde(rename = \"primaryTitle\")]\n    pub title: String,\n    /// The \"original\" name of this title.\n    #[serde(rename = \"originalTitle\")]\n    pub original_title: String,\n    /// Whether this title is classified as \"adult\" material or not.\n    #[serde(rename = \"isAdult\", deserialize_with = \"number_as_bool\")]\n    pub is_adult: bool,\n    /// The start year of this title.\n    ///\n    /// Generally, things like movies or TV episodes have a start year to\n    /// indicate their release year and no end year. TV shows also have a start\n    /// year. 
TV shows that are still airing lack an end time, but TV shows\n    /// that have stopped will typically have an end year indicating when it\n    /// stopped airing.\n    ///\n    /// Note that not all titles have a start year.\n    #[serde(rename = \"startYear\", deserialize_with = \"csv::invalid_option\")]\n    pub start_year: Option<u32>,\n    /// The end year of this title.\n    ///\n    /// This is typically used to indicate the ending year of a TV show that\n    /// has stopped production.\n    #[serde(rename = \"endYear\", deserialize_with = \"csv::invalid_option\")]\n    pub end_year: Option<u32>,\n    /// The runtime, in minutes, of this title.\n    #[serde(\n        rename = \"runtimeMinutes\",\n        deserialize_with = \"csv::invalid_option\"\n    )]\n    pub runtime_minutes: Option<u32>,\n    /// A comma separated string of genres.\n    #[serde(rename = \"genres\")]\n    pub genres: String,\n}\n\n/// The kind of a title. These form a partioning of all titles, where every\n/// title has exactly one kind.\n///\n/// This type has a `FromStr` implementation that permits parsing a string\n/// containing a title kind into this type. Note that parsing a title kind\n/// recognizes all forms present in the IMDb data, and also addition common\n/// sense forms. 
For example, `tvshow` and `tvSeries` are both accepted as\n/// terms for the `TVSeries` variant.\n#[derive(Clone, Copy, Debug, Deserialize, Eq, Hash, PartialEq, Serialize)]\n#[allow(missing_docs)]\npub enum TitleKind {\n    #[serde(rename = \"movie\")]\n    Movie,\n    #[serde(rename = \"short\")]\n    Short,\n    #[serde(rename = \"tvEpisode\")]\n    TVEpisode,\n    #[serde(rename = \"tvMiniSeries\")]\n    TVMiniSeries,\n    #[serde(rename = \"tvMovie\")]\n    TVMovie,\n    #[serde(rename = \"tvSeries\")]\n    TVSeries,\n    #[serde(rename = \"tvShort\")]\n    TVShort,\n    #[serde(rename = \"tvSpecial\")]\n    TVSpecial,\n    #[serde(rename = \"video\")]\n    Video,\n    #[serde(rename = \"videoGame\")]\n    VideoGame,\n}\n\nimpl TitleKind {\n    /// Return a string representation of this title kind.\n    ///\n    /// This string representation is intended to be the same string\n    /// representation used in the IMDb data files.\n    pub fn as_str(&self) -> &'static str {\n        use self::TitleKind::*;\n        match *self {\n            Movie => \"movie\",\n            Short => \"short\",\n            TVEpisode => \"tvEpisode\",\n            TVMiniSeries => \"tvMiniSeries\",\n            TVMovie => \"tvMovie\",\n            TVSeries => \"tvSeries\",\n            TVShort => \"tvShort\",\n            TVSpecial => \"tvSpecial\",\n            Video => \"video\",\n            VideoGame => \"videoGame\",\n        }\n    }\n\n    /// Returns true if and only if this kind represents a TV series.\n    pub fn is_tv_series(&self) -> bool {\n        use self::TitleKind::*;\n\n        match *self {\n            TVMiniSeries | TVSeries => true,\n            _ => false,\n        }\n    }\n}\n\nimpl fmt::Display for TitleKind {\n    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {\n        write!(f, \"{}\", self.as_str())\n    }\n}\n\nimpl Ord for TitleKind {\n    fn cmp(&self, other: &TitleKind) -> cmp::Ordering {\n        self.as_str().cmp(other.as_str())\n    
}\n}\n\nimpl PartialOrd for TitleKind {\n    fn partial_cmp(&self, other: &TitleKind) -> Option<cmp::Ordering> {\n        Some(self.cmp(other))\n    }\n}\n\nimpl FromStr for TitleKind {\n    type Err = Error;\n\n    fn from_str(ty: &str) -> Result<TitleKind, Error> {\n        use self::TitleKind::*;\n\n        match &*ty.to_lowercase() {\n            \"movie\" => Ok(Movie),\n            \"short\" => Ok(Short),\n            \"tvepisode\" | \"episode\" => Ok(TVEpisode),\n            \"tvminiseries\" | \"miniseries\" => Ok(TVMiniSeries),\n            \"tvmovie\" => Ok(TVMovie),\n            \"tvseries\" | \"tvshow\" | \"show\" => Ok(TVSeries),\n            \"tvshort\" => Ok(TVShort),\n            \"tvspecial\" | \"special\" => Ok(TVSpecial),\n            \"video\" => Ok(Video),\n            \"videogame\" | \"game\" => Ok(VideoGame),\n            unk => Err(Error::unknown_title(unk)),\n        }\n    }\n}\n\n/// A single alternate name.\n///\n/// Every title has one or more names, and zero or more alternate names. 
To\n/// represent multiple names, AKA or \"also known as\" records are provided.\n/// There may be many AKA records for a single title.\n#[derive(Clone, Debug, Deserialize)]\npub struct AKA {\n    /// The IMDb identifier that these AKA records describe.\n    #[serde(rename = \"titleId\")]\n    pub id: String,\n    /// The order in which an AKA record should be preferred.\n    #[serde(rename = \"ordering\")]\n    pub order: i32,\n    /// The alternate name.\n    #[serde(rename = \"title\")]\n    pub title: String,\n    /// A geographic region in which this alternate name applies.\n    #[serde(rename = \"region\")]\n    pub region: String,\n    /// The language of this alternate name.\n    #[serde(rename = \"language\")]\n    pub language: String,\n    /// A comma separated list of types for this name.\n    #[serde(rename = \"types\")]\n    pub types: String,\n    /// A comma separated list of attributes for this name.\n    #[serde(rename = \"attributes\")]\n    pub attributes: String,\n    /// A flag indicating whether this corresponds to the original title or\n    /// not.\n    #[serde(\n        rename = \"isOriginalTitle\",\n        deserialize_with = \"optional_number_as_bool\"\n    )]\n    pub is_original_title: Option<bool>,\n}\n\n/// A single episode record.\n///\n/// An episode record is an entry that joins two title records together, and\n/// provides episode specific information, such as the season and episode\n/// number. 
The two title records joined correspond to the title record for the\n/// TV show and the title record for the episode.\n#[derive(Clone, Debug, Deserialize)]\npub struct Episode {\n    /// The IMDb title identifier for this episode.\n    #[serde(rename = \"tconst\")]\n    pub id: String,\n    /// The IMDb title identifier for the parent TV show of this episode.\n    #[serde(rename = \"parentTconst\")]\n    pub tvshow_id: String,\n    /// The season in which this episode is contained, if it exists.\n    #[serde(\n        rename = \"seasonNumber\",\n        deserialize_with = \"csv::invalid_option\"\n    )]\n    pub season: Option<u32>,\n    /// The episode number of the season in which this episode is contained, if\n    /// it exists.\n    #[serde(\n        rename = \"episodeNumber\",\n        deserialize_with = \"csv::invalid_option\"\n    )]\n    pub episode: Option<u32>,\n}\n\n/// A rating associated with a single title record.\n#[derive(Clone, Debug, Deserialize)]\npub struct Rating {\n    /// The IMDb title identifier for this rating.\n    #[serde(rename = \"tconst\")]\n    pub id: String,\n    /// The rating, on a scale of 0 to 10, for this title.\n    #[serde(rename = \"averageRating\")]\n    pub rating: f32,\n    /// The number of votes involved in this rating.\n    #[serde(rename = \"numVotes\")]\n    pub votes: u32,\n}\n\nfn number_as_bool<'de, D>(de: D) -> Result<bool, D::Error>\nwhere\n    D: Deserializer<'de>,\n{\n    i32::deserialize(de).map(|n| n != 0)\n}\n\nfn optional_number_as_bool<'de, D>(de: D) -> Result<Option<bool>, D::Error>\nwhere\n    D: Deserializer<'de>,\n{\n    Ok(i32::deserialize(de).map(|n| Some(n != 0)).unwrap_or(None))\n}\n"
  },
  {
    "path": "imdb-index/src/scored.rs",
    "content": "use std::cmp;\nuse std::collections::BinaryHeap;\nuse std::num::FpCategory;\nuse std::vec;\n\n/// A collection of scored values, sorted in descending order by score.\n#[derive(Clone, Debug, Default)]\npub struct SearchResults<T>(Vec<Scored<T>>);\n\nimpl<T> SearchResults<T> {\n    /// Create an empty collection of scored values.\n    pub fn new() -> SearchResults<T> {\n        SearchResults(vec![])\n    }\n\n    /// Create a collection of search results from a min-heap of scored values.\n    pub fn from_min_heap(\n        queue: &mut BinaryHeap<cmp::Reverse<Scored<T>>>,\n    ) -> SearchResults<T> {\n        let mut results = vec![];\n        while let Some(x) = queue.pop() {\n            results.push(x.0);\n        }\n        results.reverse();\n        SearchResults(results)\n    }\n\n    /// Add a new scored value to this collection.\n    ///\n    /// The score provided must be less than or equal to every other score in\n    /// this collection, otherwise this method will panic.\n    pub fn push(&mut self, scored: Scored<T>) {\n        assert!(self.0.last().map_or(true, |smallest| &scored <= smallest));\n        self.0.push(scored);\n    }\n\n    /// Normalizes the scores in this collection such that all scores are in\n    /// the range `[0, 1]` where the top result always has score `1.0`.\n    ///\n    /// This operation is idempotent and does not change the ordering of\n    /// results.\n    pub fn normalize(&mut self) {\n        if let Some(top_score) = self.0.get(0).map(|s| s.score()) {\n            // The minimal score is 0, so if the top score is 0, then all\n            // scores must be 0. No normalization needed. 
(And we avoid a\n            // divide-by-zero below.)\n            if top_score.classify() == FpCategory::Zero {\n                return;\n            }\n            for result in &mut self.0 {\n                let score = result.score();\n                result.set_score(score / top_score);\n            }\n        }\n    }\n\n    /// Recomputes the scores in this collection using the given function.\n    ///\n    /// The results are then re-sorted according to the new scores.\n    pub fn rescore<F: FnMut(&T) -> f64>(&mut self, mut rescore: F) {\n        for result in &mut self.0 {\n            let score = rescore(result.value());\n            result.set_score(score);\n        }\n        self.0.sort_by(|s1, s2| s1.cmp(&s2).reverse());\n    }\n\n    /// Trim this collection so that it contains at most the first `size`\n    /// results.\n    pub fn trim(&mut self, size: usize) {\n        if self.0.len() > size {\n            self.0.drain(size..);\n        }\n    }\n\n    /// Returns the number of results in this collection.\n    pub fn len(&self) -> usize {\n        self.0.len()\n    }\n\n    /// Returns true if and only if this collection is empty.\n    pub fn is_empty(&self) -> bool {\n        self.0.is_empty()\n    }\n\n    /// Return a slice of search results in order.\n    pub fn as_slice(&self) -> &[Scored<T>] {\n        &self.0\n    }\n\n    /// Consume this collection and return the underlying sorted sequence of\n    /// scored values.\n    pub fn into_vec(self) -> Vec<Scored<T>> {\n        self.0\n    }\n}\n\nimpl<T> IntoIterator for SearchResults<T> {\n    type IntoIter = vec::IntoIter<Scored<T>>;\n    type Item = Scored<T>;\n\n    fn into_iter(self) -> vec::IntoIter<Scored<T>> {\n        self.into_vec().into_iter()\n    }\n}\n\n/// Any value associated with a score.\n///\n/// We define Eq and Ord on this type in a way that ignores `value` and only\n/// uses the `score` to determine ordering. 
The public API of `Scored`\n/// guarantees that scores are never `NaN`.\n#[derive(Clone, Copy, Debug)]\npub struct Scored<T> {\n    score: f64,\n    value: T,\n}\n\nimpl<T> Scored<T> {\n    /// Create a new value `T` with a score of `1.0`.\n    pub fn new(value: T) -> Scored<T> {\n        Scored { score: 1.0, value }\n    }\n\n    /// Return the score for this item.\n    ///\n    /// In general, no restrictions are placed on the range of scores, however\n    /// most search APIs that use it will return scores in the range `[0, 1]`.\n    ///\n    /// The score returned is guaranteed to never be `NaN`.\n    pub fn score(&self) -> f64 {\n        self.score\n    }\n\n    /// Set the score, replacing the existing value with the given value.\n    ///\n    /// This panics if the given score is `NaN`.\n    pub fn set_score(&mut self, score: f64) {\n        assert!(score.is_finite());\n        self.score = score;\n    }\n\n    /// Consume this scored value and return a new scored value that drops the\n    /// existing score and replaces it with the given score.\n    ///\n    /// This panics if the given score is `NaN`.\n    pub fn with_score(mut self, score: f64) -> Scored<T> {\n        self.set_score(score);\n        self\n    }\n\n    /// Consume this scored value and map its value using the function given,\n    /// returning a new scored value with the result of the map and an\n    /// unchanged score.\n    pub fn map<U, F: FnOnce(T) -> U>(self, f: F) -> Scored<U> {\n        Scored { score: self.score, value: f(self.value) }\n    }\n\n    /// Consume this scored value and map its score using the function given,\n    /// return a new `Scored` with an unchanged value.\n    ///\n    /// This panics if score returned by `f` is `NaN`.\n    pub fn map_score<F: FnOnce(f64) -> f64>(self, f: F) -> Scored<T> {\n        let score = f(self.score);\n        self.with_score(score)\n    }\n\n    /// Return a reference to the underlying value.\n    pub fn value(&self) -> &T {\n        
&self.value\n    }\n\n    /// Consume this scored value, drop the score and return the underlying\n    /// `T`.\n    pub fn into_value(self) -> T {\n        self.value\n    }\n\n    /// Consume this scored value and return the underlying pair of score and\n    /// `T`.\n    pub fn into_pair(self) -> (f64, T) {\n        (self.score, self.value)\n    }\n}\n\nimpl<T: Default> Default for Scored<T> {\n    fn default() -> Scored<T> {\n        Scored::new(T::default())\n    }\n}\n\nimpl<T> Eq for Scored<T> {}\n\nimpl<T> PartialEq for Scored<T> {\n    fn eq(&self, other: &Scored<T>) -> bool {\n        let (s1, s2) = (self.score, other.score);\n        s1 == s2\n    }\n}\n\nimpl<T> Ord for Scored<T> {\n    fn cmp(&self, other: &Scored<T>) -> cmp::Ordering {\n        self.score.partial_cmp(&other.score).unwrap()\n    }\n}\n\nimpl<T> PartialOrd for Scored<T> {\n    fn partial_cmp(&self, other: &Scored<T>) -> Option<cmp::Ordering> {\n        Some(self.cmp(other))\n    }\n}\n\n#[cfg(test)]\nmod tests {\n    use super::Scored;\n    use std::f64::NAN;\n\n    #[test]\n    #[should_panic]\n    fn never_nan_1() {\n        Scored::new(()).set_score(NAN);\n    }\n\n    #[test]\n    #[should_panic]\n    fn never_nan_2() {\n        Scored::new(()).with_score(NAN);\n    }\n\n    #[test]\n    #[should_panic]\n    fn never_nan_3() {\n        Scored::new(()).map_score(|_| NAN);\n    }\n}\n"
  },
  {
    "path": "imdb-index/src/search.rs",
    "content": "use std::cmp;\nuse std::f64;\nuse std::fmt;\nuse std::result;\nuse std::str::FromStr;\n\nuse lazy_static::lazy_static;\nuse regex::Regex;\nuse serde::{Deserialize, Deserializer, Serialize, Serializer};\n\nuse crate::error::{Error, Result};\nuse crate::index::{Index, MediaEntity, NameQuery, NameScorer};\nuse crate::record::{Episode, Rating, Title, TitleKind};\nuse crate::scored::{Scored, SearchResults};\nuse crate::util::{csv_file, IMDB_BASICS};\n\n/// A handle that permits searching IMDb media records with relevance ranking.\n///\n/// A searcher is constructed by providing it a handle to an IMDb\n/// [`Index`](struct.Index.html). The `Index` is responsible for managing the\n/// lower level data access, while the `Searcher` provides high level routines\n/// for ranking results.\n///\n/// The primary interface to a `Searcher` is its `search` method, which takes\n/// as input a [`Query`](struct.Query.html) and returns a ranked list of\n/// [`MediaEntity`](struct.MediaEntity.html) as output.\n#[derive(Debug)]\npub struct Searcher {\n    idx: Index,\n}\n\nimpl Searcher {\n    /// Create a new searcher for the given `Index`.\n    ///\n    /// A single searcher can be used to execute many queries.\n    ///\n    /// An existing `Index` can be opened with `Index::open`, and a new `Index`\n    /// can be created with `Index::create`.\n    pub fn new(idx: Index) -> Searcher {\n        Searcher { idx }\n    }\n\n    /// Execute a search with the given `Query`.\n    ///\n    /// Generally, the results returned are ranked in relevance order, where\n    /// each result has a score associated with it. 
The score is between\n    /// `0` and `1.0` (inclusive), where a score of `1.0` means \"most similar\"\n    /// and a score of `0` means \"least similar.\"\n    ///\n    /// Depending on the query, the behavior of search can vary:\n    ///\n    /// * When the query specifies a similarity function, then the results are\n    ///   ranked by that function.\n    /// * When the query contains a name to search by and a name scorer, then\n    ///   results are ranked by the name scorer. If the query specifies a\n    ///   similarity function, then results are first ranked by the name\n    ///   scorer, and then re-ranked by the similarity function.\n    /// * When no name or no name scorer are specified by the query, then\n    ///   this search will do a (slow) exhaustive search over all media records\n    ///   in IMDb. As a special case, if the query contains a TV show ID, then\n    ///   only records in that TV show are searched, and this is generally\n    ///   fast.\n    /// * If the query is empty, then no results are returned.\n    ///\n    /// If there was a problem reading the underlying index or the IMDb data,\n    /// then an error is returned.\n    pub fn search(\n        &mut self,\n        query: &Query,\n    ) -> Result<SearchResults<MediaEntity>> {\n        if query.is_empty() {\n            return Ok(SearchResults::new());\n        }\n        let mut results = match query.name_query() {\n            None => self.search_exhaustive(query)?,\n            Some(nameq) => self.search_with_name(query, &nameq)?,\n        };\n        results.trim(query.size);\n        results.normalize();\n        Ok(results)\n    }\n\n    /// Return a mutable reference to the underlying index for this searcher.\n    pub fn index(&mut self) -> &mut Index {\n        &mut self.idx\n    }\n\n    fn search_with_name(\n        &mut self,\n        query: &Query,\n        name_query: &NameQuery,\n    ) -> Result<SearchResults<MediaEntity>> {\n        let mut results = 
SearchResults::new();\n        for r in self.idx.search(name_query)? {\n            if query.similarity.is_none() && results.len() >= query.size {\n                break;\n            }\n            let (score, title) = r.into_pair();\n            let entity = self.idx.entity_from_title(title)?;\n            if query.matches(&entity) {\n                results.push(Scored::new(entity).with_score(score));\n            }\n        }\n        if !query.similarity.is_none() {\n            results.rescore(|e| self.similarity(query, &e.title().title));\n        }\n        Ok(results)\n    }\n\n    fn search_exhaustive(\n        &mut self,\n        query: &Query,\n    ) -> Result<SearchResults<MediaEntity>> {\n        if let Some(ref tvshow_id) = query.tvshow_id {\n            return self.search_with_tvshow(query, tvshow_id);\n        }\n\n        let mut rdr = csv_file(self.idx.data_dir().join(IMDB_BASICS))?;\n        if !query.has_filters() {\n            let mut nresults = SearchResults::new();\n            let mut record = csv::StringRecord::new();\n            while rdr.read_record(&mut record).map_err(Error::csv)? {\n                let id_title = (record[0].to_string(), record[2].to_string());\n                nresults.push(Scored::new(id_title));\n            }\n            nresults.rescore(|t| self.similarity(query, &t.1));\n\n            let mut results = SearchResults::new();\n            for nresult in nresults.into_vec().into_iter().take(query.size) {\n                let (score, (id, _)) = nresult.into_pair();\n                let entity = match self.idx.entity(&id)? 
{\n                    None => continue,\n                    Some(entity) => entity,\n                };\n                results.push(Scored::new(entity).with_score(score));\n            }\n            Ok(results)\n        } else if query.needs_only_title() {\n            let mut tresults = SearchResults::new();\n            for result in rdr.deserialize() {\n                let title: Title = result.map_err(Error::csv)?;\n                if query.matches_title(&title) {\n                    tresults.push(Scored::new(title));\n                }\n            }\n            tresults.rescore(|t| self.similarity(query, &t.title));\n\n            let mut results = SearchResults::new();\n            for tresult in tresults.into_vec().into_iter().take(query.size) {\n                let (score, title) = tresult.into_pair();\n                let entity = self.idx.entity_from_title(title)?;\n                results.push(Scored::new(entity).with_score(score));\n            }\n            Ok(results)\n        } else {\n            let mut results = SearchResults::new();\n            for result in rdr.deserialize() {\n                let title = result.map_err(Error::csv)?;\n                let entity = self.idx.entity_from_title(title)?;\n                if query.matches(&entity) {\n                    results.push(Scored::new(entity));\n                }\n            }\n            results.rescore(|e| self.similarity(query, &e.title().title));\n            Ok(results)\n        }\n    }\n\n    fn search_with_tvshow(\n        &mut self,\n        query: &Query,\n        tvshow_id: &str,\n    ) -> Result<SearchResults<MediaEntity>> {\n        let mut results = SearchResults::new();\n        for ep in self.idx.seasons(tvshow_id)? {\n            let entity = match self.idx.entity(&ep.id)? 
{\n                None => continue,\n                Some(entity) => entity,\n            };\n            if query.matches(&entity) {\n                results.push(Scored::new(entity));\n            }\n        }\n        if !query.similarity.is_none() {\n            results.rescore(|e| self.similarity(query, &e.title().title));\n        }\n        Ok(results)\n    }\n\n    fn similarity(&self, query: &Query, name: &str) -> f64 {\n        match query.name {\n            None => 0.0,\n            Some(ref qname) => query.similarity.similarity(qname, name),\n        }\n    }\n}\n\n/// A query that can be used to search IMDb media records.\n///\n/// A query typically consists of a fuzzy name query along with zero or more\n/// filters. If a query lacks a fuzzy name query, then this will generally\n/// result in an exhaustive search of all IMDb media records, which can be\n/// slow.\n///\n/// Filters are matched conjunctively. That is, a search result must satisfy\n/// every filter on a query to match.\n///\n/// Empty queries always return no results.\n///\n/// The `Serialize` and `Deserialize` implementations for this type use the\n/// free-form query syntax.\n#[derive(Clone, Debug, Eq, Hash, PartialEq)]\npub struct Query {\n    name: Option<String>,\n    name_scorer: Option<NameScorer>,\n    similarity: Similarity,\n    size: usize,\n    kinds: Vec<TitleKind>,\n    year: Range<u32>,\n    votes: Range<u32>,\n    season: Range<u32>,\n    episode: Range<u32>,\n    tvshow_id: Option<String>,\n}\n\nimpl Default for Query {\n    fn default() -> Query {\n        Query::new()\n    }\n}\n\nimpl Query {\n    /// Create a new empty query.\n    pub fn new() -> Query {\n        Query {\n            name: None,\n            name_scorer: Some(NameScorer::default()),\n            similarity: Similarity::default(),\n            size: 30,\n            kinds: vec![],\n            year: Range::none(),\n            votes: Range::none(),\n            season: Range::none(),\n            
episode: Range::none(),\n            tvshow_id: None,\n        }\n    }\n\n    /// Return true if and only if this query is empty.\n    ///\n    /// Searching with an empty query always yields no results.\n    pub fn is_empty(&self) -> bool {\n        self.name.as_ref().map_or(true, |n| n.is_empty())\n            && self.kinds.is_empty()\n            && self.year.is_none()\n            && self.votes.is_none()\n            && self.season.is_none()\n            && self.episode.is_none()\n            && self.tvshow_id.is_none()\n    }\n\n    /// Set the name to query by.\n    ///\n    /// The name given here is normalized and broken down into components\n    /// automatically to facilitate fuzzy searching.\n    ///\n    /// Note that if no name is provided in a query, then it is possible that\n    /// searching with the query will require exhaustively looking at every\n    /// record in IMDb. This will be slower.\n    pub fn name(mut self, name: &str) -> Query {\n        self.name = Some(name.to_string());\n        self\n    }\n\n    /// Set the scorer to use for name searches.\n    ///\n    /// The name scorer is used to rank results from searching the IMDb name\n    /// index. If no name query is given, then this scorer is not used.\n    ///\n    /// If `None` is provided here, then the name index will not be used. This\n    /// will likely cause an exhaustive search of all IMDb records, which can\n    /// be slow. The use case for providing a name query without a name scorer\n    /// is if you, for example, wanted to rank all of the records in IMDb\n    /// by the Levenshtein distance between your query and every other record\n    /// in IMDb. Normally, when the name index is used, only the (small number)\n    /// of results returned by searching the name are ranked. 
Typically, these\n    /// sorts of queries are useful for evaluation purposes, but not much else.\n    pub fn name_scorer(mut self, scorer: Option<NameScorer>) -> Query {\n        self.name_scorer = scorer;\n        self\n    }\n\n    /// Set the similarity function.\n    ///\n    /// The similarity function can be selected from a predefined set of\n    /// choices defined by the\n    /// [`Similarity`](enum.Similarity.html) type.\n    ///\n    /// When a similarity function is used, then any results from searching\n    /// the name index are re-ranked according to their similarity with the\n    /// query.\n    ///\n    /// By default, no similarity function is used.\n    pub fn similarity(mut self, sim: Similarity) -> Query {\n        self.similarity = sim;\n        self\n    }\n\n    /// Set the maximum number of results to be returned by a search.\n    ///\n    /// Note that setting this number too high (e.g., `> 10,000`) can impact\n    /// performance. This is a normal restriction found in most information\n    /// retrieval systems. That is, deep paging through result sets is\n    /// expensive.\n    pub fn size(mut self, size: usize) -> Query {\n        self.size = size;\n        self\n    }\n\n    /// Add a title kind to filter by.\n    ///\n    /// Multiple title kinds can be added to query, and search results must\n    /// match at least one of them.\n    ///\n    /// Note that it is not possible to remove title kinds from an existing\n    /// query. 
Instead, build a new query from scratch.\n    pub fn kind(mut self, kind: TitleKind) -> Query {\n        if !self.kinds.contains(&kind) {\n            self.kinds.push(kind);\n        }\n        self\n    }\n\n    /// Set the lower inclusive bound on a title's year.\n    ///\n    /// This applies to either the title's start or end years.\n    pub fn year_ge(mut self, year: u32) -> Query {\n        self.year.start = Some(year);\n        self\n    }\n\n    /// Set the upper inclusive bound on a title's year.\n    ///\n    /// This applies to either the title's start or end years.\n    pub fn year_le(mut self, year: u32) -> Query {\n        self.year.end = Some(year);\n        self\n    }\n\n    /// Set the lower inclusive bound on a title's number of votes.\n    pub fn votes_ge(mut self, votes: u32) -> Query {\n        self.votes.start = Some(votes);\n        self\n    }\n\n    /// Set the upper inclusive bound on a title's number of votes.\n    pub fn votes_le(mut self, votes: u32) -> Query {\n        self.votes.end = Some(votes);\n        self\n    }\n\n    /// Set the lower inclusive bound on a title's season.\n    ///\n    /// This automatically limits all results to episodes.\n    pub fn season_ge(mut self, season: u32) -> Query {\n        self.season.start = Some(season);\n        self\n    }\n\n    /// Set the upper inclusive bound on a title's season.\n    ///\n    /// This automatically limits all results to episodes.\n    pub fn season_le(mut self, season: u32) -> Query {\n        self.season.end = Some(season);\n        self\n    }\n\n    /// Set the lower inclusive bound on a title's episode number.\n    ///\n    /// This automatically limits all results to episodes.\n    pub fn episode_ge(mut self, episode: u32) -> Query {\n        self.episode.start = Some(episode);\n        self\n    }\n\n    /// Set the upper inclusive bound on a title's episode number.\n    ///\n    /// This automatically limits all results to episodes.\n    pub fn episode_le(mut 
self, episode: u32) -> Query {\n        self.episode.end = Some(episode);\n        self\n    }\n\n    /// Restrict results to episodes belonging to the TV show given by its\n    /// IMDb ID.\n    ///\n    /// This automatically limits all results to episodes.\n    pub fn tvshow_id(mut self, tvshow_id: &str) -> Query {\n        self.tvshow_id = Some(tvshow_id.to_string());\n        self\n    }\n\n    /// Returns true if and only if the given entity matches this query.\n    ///\n    /// Note that this only applies filters in this query. e.g., The name\n    /// aspect of the query, if one exists, is ignored.\n    fn matches(&self, ent: &MediaEntity) -> bool {\n        self.matches_title(&ent.title())\n            && self.matches_rating(ent.rating())\n            && self.matches_episode(ent.episode())\n    }\n\n    /// Returns true if and only if the given title matches this query.\n    ///\n    /// This ignores non-title filters.\n    fn matches_title(&self, title: &Title) -> bool {\n        if !self.kinds.is_empty() && !self.kinds.contains(&title.kind) {\n            return false;\n        }\n        if !self.year.contains(title.start_year.as_ref())\n            && !self.year.contains(title.end_year.as_ref())\n        {\n            return false;\n        }\n        true\n    }\n\n    /// Returns true if and only if the given rating matches this query.\n    ///\n    /// This ignores non-rating filters.\n    ///\n    /// If a rating filter is present and `None` is given, then this always\n    /// returns `false`.\n    fn matches_rating(&self, rating: Option<&Rating>) -> bool {\n        if !self.votes.contains(rating.map(|r| &r.votes)) {\n            return false;\n        }\n        true\n    }\n\n    /// Returns true if and only if the given episode matches this query.\n    ///\n    /// This ignores non-episode filters.\n    ///\n    /// If an episode filter is present and `None` is given, then this always\n    /// returns `false`.\n    fn matches_episode(&self, ep: 
Option<&Episode>) -> bool {\n        if !self.season.contains(ep.and_then(|e| e.season.as_ref())) {\n            return false;\n        }\n        if !self.episode.contains(ep.and_then(|e| e.episode.as_ref())) {\n            return false;\n        }\n        if let Some(ref tvshow_id) = self.tvshow_id {\n            if ep.map_or(true, |e| tvshow_id != &e.tvshow_id) {\n                return false;\n            }\n        }\n        true\n    }\n\n    /// Build a name query suitable for this query.\n    ///\n    /// The name query returned may request many more results than the result\n    /// size maximum on this query.\n    fn name_query(&self) -> Option<NameQuery> {\n        let name = match self.name.as_ref() {\n            None => return None,\n            Some(name) => &**name,\n        };\n        let scorer = match self.name_scorer {\n            None => return None,\n            Some(scorer) => scorer,\n        };\n        // We want our name query to return a healthy set of results, even if\n        // it's well beyond the result set size requested by the user. This is\n        // primarily because a name search doesn't incorporate filters itself,\n        // which simplifies the implementation. Therefore, we need to request\n        // more results than what we need in case our filter is aggressive.\n        let size = cmp::max(1000, self.size);\n        Some(NameQuery::new(name).with_size(size).with_scorer(scorer))\n    }\n\n    /// Returns true if and only if this query has any filters.\n    ///\n    /// When a query lacks filters, then the result set can be completely\n    /// determined by searching the name index and applying a similarity\n    /// function, if present. 
This can make exhaustive searches, particularly\n    /// the ones used during an evaluation, a bit faster.\n    fn has_filters(&self) -> bool {\n        self.needs_rating()\n            || self.needs_episode()\n            || !self.kinds.is_empty()\n            || !self.year.is_none()\n    }\n\n    /// Returns true if and only this query has only title filters.\n    ///\n    /// When true, this can make exhaustive searches faster by avoiding the\n    /// need to fetch the rating and/or episode for every title in IMDb.\n    fn needs_only_title(&self) -> bool {\n        !self.needs_rating() && !self.needs_episode()\n    }\n\n    /// Returns true if and only if this query has a rating filter.\n    fn needs_rating(&self) -> bool {\n        !self.votes.is_none()\n    }\n\n    /// Returns true if and only if this query has an episode filter.\n    fn needs_episode(&self) -> bool {\n        !self.season.is_none()\n            || !self.episode.is_none()\n            || !self.tvshow_id.is_none()\n    }\n}\n\nimpl Serialize for Query {\n    fn serialize<S>(&self, s: S) -> result::Result<S::Ok, S::Error>\n    where\n        S: Serializer,\n    {\n        s.serialize_str(&self.to_string())\n    }\n}\n\nimpl<'a> Deserialize<'a> for Query {\n    fn deserialize<D>(d: D) -> result::Result<Query, D::Error>\n    where\n        D: Deserializer<'a>,\n    {\n        use serde::de::Error;\n\n        let querystr = String::deserialize(d)?;\n        querystr\n            .parse()\n            .map_err(|e: self::Error| D::Error::custom(e.to_string()))\n    }\n}\n\nimpl FromStr for Query {\n    type Err = Error;\n\n    fn from_str(qstr: &str) -> Result<Query> {\n        lazy_static! {\n            // The 'directive', 'terms' and 'space' groups are all mutually\n            // exclusive. When 'directive' matches, we parse it using DIRECTIVE\n            // in a subsequent step. When 'terms' matches, we add them to the\n            // name query. 
Then 'space' matches, we ignore it.\n            static ref PARTS: Regex = Regex::new(\n                r\"\\{(?P<directive>[^}]+)\\}|(?P<terms>[^{}\\s]+)|(?P<space>\\s+)\"\n            ).unwrap();\n\n            // Parse a directive of the form '{name:val}' or '{kind}'.\n            static ref DIRECTIVE: Regex = Regex::new(\n                r\"^(?:(?P<name>[^:]+):(?P<val>.+)|(?P<kind>.+))$\"\n            ).unwrap();\n        }\n        let mut terms = vec![];\n        let mut q = Query::new();\n        for caps in PARTS.captures_iter(qstr) {\n            if caps.name(\"space\").is_some() {\n                continue;\n            } else if let Some(m) = caps.name(\"terms\") {\n                terms.push(m.as_str().to_string());\n                continue;\n            }\n\n            let dcaps = DIRECTIVE.captures(&caps[\"directive\"]).unwrap();\n            if let Some(m) = dcaps.name(\"kind\") {\n                q = q.kind(m.as_str().parse()?);\n                continue;\n            }\n\n            let (name, val) = (dcaps[\"name\"].trim(), dcaps[\"val\"].trim());\n            match name {\n                \"size\" => {\n                    q.size = val.parse().map_err(Error::number)?;\n                }\n                \"year\" => {\n                    q.year = val.parse()?;\n                }\n                \"votes\" => {\n                    q.votes = val.parse()?;\n                }\n                \"season\" => {\n                    q.season = val.parse()?;\n                }\n                \"episode\" => {\n                    q.episode = val.parse()?;\n                }\n                \"tvseries\" | \"tvshow\" | \"show\" => {\n                    q.tvshow_id = Some(val.to_string());\n                }\n                \"sim\" | \"similarity\" => {\n                    q.similarity = val.parse()?;\n                }\n                \"scorer\" => {\n                    if val == \"none\" {\n                        q.name_scorer = None;\n         
           } else {\n                        q.name_scorer = Some(val.parse()?);\n                    }\n                }\n                unk => return Err(Error::unknown_directive(unk)),\n            }\n        }\n        if !terms.is_empty() {\n            q = q.name(&terms.join(\" \"));\n        }\n        Ok(q)\n    }\n}\n\nimpl fmt::Display for Query {\n    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {\n        match self.name_scorer {\n            None => f.write_str(\"{scorer:none}\")?,\n            Some(ref scorer) => write!(f, \"{{scorer:{}}}\", scorer)?,\n        }\n        write!(f, \" {{sim:{}}}\", self.similarity)?;\n        write!(f, \" {{size:{}}}\", self.size)?;\n\n        let mut kinds: Vec<&TitleKind> = self.kinds.iter().collect();\n        kinds.sort();\n        for kind in kinds {\n            write!(f, \" {{{}}}\", kind)?;\n        }\n        if !self.year.is_none() {\n            write!(f, \" {{year:{}}}\", self.year)?;\n        }\n        if !self.votes.is_none() {\n            write!(f, \" {{votes:{}}}\", self.votes)?;\n        }\n        if !self.season.is_none() {\n            write!(f, \" {{season:{}}}\", self.season)?;\n        }\n        if !self.episode.is_none() {\n            write!(f, \" {{episode:{}}}\", self.episode)?;\n        }\n        if let Some(ref tvshow_id) = self.tvshow_id {\n            write!(f, \" {{show:{}}}\", tvshow_id)?;\n        }\n        if let Some(ref name) = self.name {\n            write!(f, \" {}\", name)?;\n        }\n        Ok(())\n    }\n}\n\n/// A ranking function to use when searching IMDb records.\n///\n/// A similarity ranking function computes a score between `0.0` and `1.0` (not\n/// including `0` but including `1.0`) for a query and a candidate result. The\n/// score is determined by the corresponding names for a query and a candidate,\n/// and a higher score indicates more similarity.\n///\n/// This ranking function can be used to increase the precision of a set\n/// of results. 
In particular, when a similarity function is provided to\n/// a [`Query`](struct.Query.html), then any results returned by querying\n/// the IMDb name index will be rescored according to this function. If no\n/// similarity function is provided, then the results will be ranked according\n/// to scores produced by the name index.\n#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]\npub enum Similarity {\n    /// Do not use a similarity function.\n    None,\n    /// Computes the Levenshtein edit distance between two names and converts\n    /// it to a similarity.\n    Levenshtein,\n    /// Computes the Jaro edit distance between two names and converts it to a\n    /// similarity.\n    Jaro,\n    /// Computes the Jaro-Winkler edit distance between two names and converts\n    /// it to a similarity.\n    JaroWinkler,\n}\n\nimpl Similarity {\n    /// Returns a list of s trings representing the possible similarity\n    /// function names.\n    pub fn possible_names() -> &'static [&'static str] {\n        &[\"none\", \"levenshtein\", \"jaro\", \"jarowinkler\"]\n    }\n\n    /// Returns true if and only if no similarity function was selected.\n    pub fn is_none(&self) -> bool {\n        *self == Similarity::None\n    }\n\n    /// Computes the similarity between the given strings according to the\n    /// underlying similarity function. If no similarity function is present,\n    /// then this always returns `1.0`.\n    ///\n    /// The returned value is always in the range `(0, 1]`.\n    pub fn similarity(&self, q1: &str, q2: &str) -> f64 {\n        let sim = match *self {\n            Similarity::None => 1.0,\n            Similarity::Levenshtein => {\n                let distance = strsim::levenshtein(q1, q2) as f64;\n                // We do a simple conversion of distance to similarity. 
This\n                // will produce very low scores even for very similar names,\n                // but callers may normalize scores.\n                //\n                // We also add `1` to the denominator to avoid division by\n                // zero. Incidentally, this causes the similarity of identical\n                // strings to be exactly 1.0, which is what we want.\n                1.0 / (1.0 + distance)\n            }\n            Similarity::Jaro => strsim::jaro(q1, q2),\n            Similarity::JaroWinkler => strsim::jaro_winkler(q1, q2),\n        };\n        // Don't permit a score to actually be zero. This prevents division\n        // by zero during normalization if all results have a score of zero.\n        if sim < f64::EPSILON {\n            f64::EPSILON\n        } else {\n            sim\n        }\n    }\n}\n\nimpl Default for Similarity {\n    fn default() -> Similarity {\n        Similarity::None\n    }\n}\n\nimpl fmt::Display for Similarity {\n    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {\n        match *self {\n            Similarity::None => write!(f, \"none\"),\n            Similarity::Levenshtein => write!(f, \"levenshtein\"),\n            Similarity::Jaro => write!(f, \"jaro\"),\n            Similarity::JaroWinkler => write!(f, \"jarowinkler\"),\n        }\n    }\n}\n\nimpl FromStr for Similarity {\n    type Err = Error;\n\n    fn from_str(s: &str) -> Result<Similarity> {\n        match s {\n            \"none\" => Ok(Similarity::None),\n            \"levenshtein\" => Ok(Similarity::Levenshtein),\n            \"jaro\" => Ok(Similarity::Jaro),\n            \"jarowinkler\" | \"jaro-winkler\" => Ok(Similarity::JaroWinkler),\n            unk => Err(Error::unknown_sim(unk)),\n        }\n    }\n}\n\n/// A range filter over any partially ordered type `T`.\n///\n/// This type permits either end of the range to be unbounded.\n#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]\nstruct Range<T> {\n    start: Option<T>,\n 
   end: Option<T>,\n}\n\nimpl<T> Range<T> {\n    pub fn none() -> Range<T> {\n        Range { start: None, end: None }\n    }\n\n    pub fn is_none(&self) -> bool {\n        self.start.is_none() && self.end.is_none()\n    }\n}\n\nimpl<T: PartialOrd> Range<T> {\n    pub fn contains(&self, t: Option<&T>) -> bool {\n        let t = match t {\n            None => return self.is_none(),\n            Some(t) => t,\n        };\n        match (&self.start, &self.end) {\n            (&None, &None) => true,\n            (&Some(ref s), &None) => s <= t,\n            (&None, &Some(ref e)) => t <= e,\n            (&Some(ref s), &Some(ref e)) => s <= t && t <= e,\n        }\n    }\n}\n\nimpl<T: fmt::Display + PartialEq> fmt::Display for Range<T> {\n    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {\n        match (&self.start, &self.end) {\n            (&None, &None) => write!(f, \"-\"),\n            (&Some(ref s), &None) => write!(f, \"{}-\", s),\n            (&None, &Some(ref e)) => write!(f, \"-{}\", e),\n            (&Some(ref s), &Some(ref e)) if s == e => write!(f, \"{}\", s),\n            (&Some(ref s), &Some(ref e)) => write!(f, \"{}-{}\", s, e),\n        }\n    }\n}\n\nimpl<E: std::error::Error + Send + Sync + 'static, T: FromStr<Err = E>> FromStr\n    for Range<T>\n{\n    type Err = Error;\n\n    fn from_str(range: &str) -> Result<Range<T>> {\n        // One wonders what happens if we need to support ranges consisting\n        // of negative numbers. 
Thankfully, it seems we needn't do that for\n        // the IMDb data.\n        let (start, end) = match range.find('-') {\n            None => {\n                // For no particular reason, parse it twice so that we don't\n                // need a `Clone` bound.\n                let start = range.parse().map_err(Error::number)?;\n                let end = range.parse().map_err(Error::number)?;\n                return Ok(Range { start: Some(start), end: Some(end) });\n            }\n            Some(i) => {\n                let (start, end) = range.split_at(i);\n                (start.trim(), end[1..].trim())\n            }\n        };\n        Ok(match (start.is_empty(), end.is_empty()) {\n            (true, true) => Range::none(),\n            (true, false) => Range {\n                start: None,\n                end: Some(end.parse().map_err(Error::number)?),\n            },\n            (false, true) => Range {\n                start: Some(start.parse().map_err(Error::number)?),\n                end: None,\n            },\n            (false, false) => Range {\n                start: Some(start.parse().map_err(Error::number)?),\n                end: Some(end.parse().map_err(Error::number)?),\n            },\n        })\n    }\n}\n\n#[cfg(test)]\nmod tests {\n    use super::*;\n\n    #[test]\n    fn ranges() {\n        let r: Range<u32> = \"5-10\".parse().unwrap();\n        assert_eq!(r, Range { start: Some(5), end: Some(10) });\n\n        let r: Range<u32> = \"5-\".parse().unwrap();\n        assert_eq!(r, Range { start: Some(5), end: None });\n\n        let r: Range<u32> = \"-10\".parse().unwrap();\n        assert_eq!(r, Range { start: None, end: Some(10) });\n\n        let r: Range<u32> = \"5-5\".parse().unwrap();\n        assert_eq!(r, Range { start: Some(5), end: Some(5) });\n\n        let r: Range<u32> = \"5\".parse().unwrap();\n        assert_eq!(r, Range { start: Some(5), end: Some(5) });\n    }\n\n    #[test]\n    fn query_parser() {\n        let q: 
Query = \"foo bar baz\".parse().unwrap();\n        assert_eq!(q, Query::new().name(\"foo bar baz\"));\n\n        let q: Query = \"{movie}\".parse().unwrap();\n        assert_eq!(q, Query::new().kind(TitleKind::Movie));\n\n        let q: Query = \"{movie} {tvshow}\".parse().unwrap();\n        assert_eq!(\n            q,\n            Query::new().kind(TitleKind::Movie).kind(TitleKind::TVSeries)\n        );\n\n        let q: Query = \"{movie}{tvshow}\".parse().unwrap();\n        assert_eq!(\n            q,\n            Query::new().kind(TitleKind::Movie).kind(TitleKind::TVSeries)\n        );\n\n        let q: Query = \"foo {movie} bar {tvshow} baz\".parse().unwrap();\n        assert_eq!(\n            q,\n            Query::new()\n                .name(\"foo bar baz\")\n                .kind(TitleKind::Movie)\n                .kind(TitleKind::TVSeries)\n        );\n\n        let q: Query = \"{size:5}\".parse().unwrap();\n        assert_eq!(q, Query::new().size(5));\n\n        let q: Query = \"{ size : 5 }\".parse().unwrap();\n        assert_eq!(q, Query::new().size(5));\n\n        let q: Query = \"{year:1990}\".parse().unwrap();\n        assert_eq!(q, Query::new().year_ge(1990).year_le(1990));\n\n        let q: Query = \"{year:1990-}\".parse().unwrap();\n        assert_eq!(q, Query::new().year_ge(1990));\n\n        let q: Query = \"{year:-1990}\".parse().unwrap();\n        assert_eq!(q, Query::new().year_le(1990));\n\n        let q: Query = \"{year:-}\".parse().unwrap();\n        assert_eq!(q, Query::new());\n    }\n\n    #[test]\n    fn query_parser_error() {\n        assert!(\"{blah}\".parse::<Query>().is_err());\n        assert!(\"{size:a}\".parse::<Query>().is_err());\n        assert!(\"{year:}\".parse::<Query>().is_err());\n    }\n\n    #[test]\n    fn query_parser_weird() {\n        let q: Query = \"{movie\".parse().unwrap();\n        assert_eq!(q, Query::new().name(\"movie\"));\n\n        let q: Query = \"movie}\".parse().unwrap();\n        assert_eq!(q, 
Query::new().name(\"movie\"));\n    }\n\n    #[test]\n    fn query_display() {\n        let q = Query::new()\n            .name(\"foo bar baz\")\n            .size(31)\n            .season_ge(4)\n            .season_le(5)\n            .kind(TitleKind::TVSeries)\n            .kind(TitleKind::Movie)\n            .similarity(Similarity::Jaro);\n        let expected =\n            \"{scorer:okapibm25} {sim:jaro} {size:31} {movie} {tvSeries} {season:4-5} foo bar baz\";\n        assert_eq!(q.to_string(), expected);\n    }\n\n    #[test]\n    fn query_serialize() {\n        #[derive(Serialize)]\n        struct Test {\n            query: Query,\n        }\n        let query = Query::new()\n            .name(\"foo bar baz\")\n            .name_scorer(None)\n            .size(31)\n            .season_ge(4)\n            .season_le(4);\n        let got = serde_json::to_string(&Test { query }).unwrap();\n\n        let expected = r#\"{\"query\":\"{scorer:none} {sim:none} {size:31} {season:4} foo bar baz\"}\"#;\n        assert_eq!(got, expected);\n    }\n\n    #[test]\n    fn query_deserialize() {\n        let json = r#\"{\"query\": \"foo {size:30} bar {season:4} baz {show}\"}\"#;\n        let expected =\n            \"{size:30} {season:4} {show} foo bar baz\".parse().unwrap();\n\n        #[derive(Deserialize)]\n        struct Test {\n            query: Query,\n        }\n        let got: Test = serde_json::from_str(json).unwrap();\n        assert_eq!(got.query, expected);\n    }\n}\n"
  },
  {
    "path": "imdb-index/src/util.rs",
    "content": "use std::fmt;\nuse std::fs::File;\nuse std::io;\nuse std::path::Path;\nuse std::time;\n\nuse memmap::Mmap;\n\nuse crate::error::{Error, ErrorKind, Result};\n\n/// The TSV file in the IMDb dataset that defines the canonical set of titles\n/// available to us. Each record contains basic information about a title,\n/// such as its IMDb identifier (e.g., `tt0096697`), primary title, start year\n/// and type. This includes movies, TV shows, episodes and more.\npub const IMDB_BASICS: &str = \"title.basics.tsv\";\n\n/// The TSV file in the IMDb dataset that defines alternate names for some of\n/// the titles found in IMDB_BASICS. This includes, but is not limited to,\n/// titles in different languages. This file uses the IMDb identifier as a\n/// foreign key.\npub const IMDB_AKAS: &str = \"title.akas.tsv\";\n\n/// The TSV file in the IMDb dataset that defines the season and episode\n/// numbers for episodes in TV shows. Each record in this file corresponds to\n/// a single episode. There are four columns: the first is the IMDb identifier\n/// for the episode. The second is the IMDb identifier for the corresponding\n/// TV show. The last two columns are the season and episode numbers. Both of\n/// the IMDb identifiers are foreign keys that join the record to IMDB_BASICS.\npub const IMDB_EPISODE: &str = \"title.episode.tsv\";\n\n/// The TSV file in the IMDb dataset that provides ratings for titles in\n/// IMDB_BASICS. 
Each title has at most one rating, and a rating corresponds\n/// to a rank (a decimal in the range 0-10) and the number of votes involved\n/// in creating that rating (from the IMDb web site, presumably).\npub const IMDB_RATINGS: &str = \"title.ratings.tsv\";\n\n/// A type that provides a Display impl for std::time::Duration.\n#[derive(Debug)]\npub struct NiceDuration(pub time::Duration);\n\nimpl fmt::Display for NiceDuration {\n    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {\n        write!(f, \"{:0.4} secs\", self.fractional_seconds())\n    }\n}\n\nimpl NiceDuration {\n    /// Create a duration corresponding to the amount of time since the\n    /// instant given.\n    pub fn since(t: time::Instant) -> NiceDuration {\n        NiceDuration(time::Instant::now().duration_since(t))\n    }\n\n    /// Returns the number of seconds in this duration in fraction form.\n    /// The number to the left of the decimal point is the number of seconds,\n    /// and the number to the right is the number of milliseconds.\n    pub fn fractional_seconds(&self) -> f64 {\n        let fractional = (self.0.subsec_nanos() as f64) / 1_000_000_000.0;\n        self.0.as_secs() as f64 + fractional\n    }\n}\n\n/// A function for creating a CSV reader builder that is pre-loaded with the\n/// correct settings for reading all IMDb CSV files.\npub fn csv_reader_builder() -> csv::ReaderBuilder {\n    let mut builder = csv::ReaderBuilder::new();\n    builder.has_headers(true).delimiter(b'\\t').quoting(false);\n    builder\n}\n\n/// Builds a CSV reader (using `csv_reader_builder`) that is backed by a\n/// seekable memory map.\n///\n/// We use memory maps for this even though we could use a normal `File`, which\n/// is also seekable, because seeking a memory map has very little overhead.\n/// Seeking a `File`, on the other hand, requires a syscall.\npub unsafe fn csv_mmap<P: AsRef<Path>>(\n    path: P,\n) -> Result<csv::Reader<io::Cursor<Mmap>>> {\n    let mmap = mmap_file(path)?;\n    
Ok(csv_reader_builder().from_reader(io::Cursor::new(mmap)))\n}\n\n/// Builds a CSV reader (using `csv_reader_builder`) that is backed by a file.\n/// While this read can be seeked, it will be less efficient than using a\n/// memory map. Therefore, this is useful for reading CSV data when no seeking\n/// is needed.\npub fn csv_file<P: AsRef<Path>>(path: P) -> Result<csv::Reader<File>> {\n    let path = path.as_ref();\n    let rdr = csv_reader_builder().from_path(path).map_err(|e| {\n        Error::new(ErrorKind::Csv(format!(\"{}: {}\", path.display(), e)))\n    })?;\n    Ok(rdr)\n}\n\n/// Builds a file-backed memory map.\npub unsafe fn mmap_file<P: AsRef<Path>>(path: P) -> Result<Mmap> {\n    let path = path.as_ref();\n    let file = open_file(path)?;\n    let mmap = Mmap::map(&file).map_err(|e| Error::io_path(e, path))?;\n    Ok(mmap)\n}\n\n/// Creates a file and truncates it.\npub fn create_file<P: AsRef<Path>>(path: P) -> Result<File> {\n    let path = path.as_ref();\n    let file = File::create(path).map_err(|e| Error::io_path(e, path))?;\n    Ok(file)\n}\n\n/// Opens a file for reading.\npub fn open_file<P: AsRef<Path>>(path: P) -> Result<File> {\n    let path = path.as_ref();\n    let file = File::open(path).map_err(|e| Error::io_path(e, path))?;\n    Ok(file)\n}\n\n/// Creates an FST set builder for the given file path.\npub fn fst_set_builder_file<P: AsRef<Path>>(\n    path: P,\n) -> Result<fst::SetBuilder<io::BufWriter<File>>> {\n    let path = path.as_ref();\n    let wtr = io::BufWriter::new(create_file(path)?);\n    let builder = fst::SetBuilder::new(wtr).map_err(|e| {\n        Error::new(ErrorKind::Fst(format!(\"{}: {}\", path.display(), e)))\n    })?;\n    Ok(builder)\n}\n\n/// Open an FST set file for the given file path as a memory map.\npub unsafe fn fst_set_file<P: AsRef<Path>>(path: P) -> Result<fst::Set<Mmap>> {\n    let path = path.as_ref();\n    let file = File::open(path).map_err(|e| Error::io_path(e, path))?;\n    let mmap = 
Mmap::map(&file).map_err(|e| Error::io_path(e, path))?;\n    let set = fst::Set::new(mmap).map_err(|e| {\n        Error::new(ErrorKind::Fst(format!(\"{}: {}\", path.display(), e)))\n    })?;\n    Ok(set)\n}\n\n/// Creates an FST map builder for the given file path.\npub fn fst_map_builder_file<P: AsRef<Path>>(\n    path: P,\n) -> Result<fst::MapBuilder<io::BufWriter<File>>> {\n    let path = path.as_ref();\n    let wtr = io::BufWriter::new(create_file(path)?);\n    let builder = fst::MapBuilder::new(wtr).map_err(|e| {\n        Error::new(ErrorKind::Fst(format!(\"{}: {}\", path.display(), e)))\n    })?;\n    Ok(builder)\n}\n\n/// Open an FST map file for the given file path as a memory map.\npub unsafe fn fst_map_file<P: AsRef<Path>>(path: P) -> Result<fst::Map<Mmap>> {\n    let path = path.as_ref();\n    let file = File::open(path).map_err(|e| Error::io_path(e, path))?;\n    let mmap = Mmap::map(&file).map_err(|e| Error::io_path(e, path))?;\n    let map = fst::Map::new(mmap).map_err(|e| {\n        Error::new(ErrorKind::Fst(format!(\"{}: {}\", path.display(), e)))\n    })?;\n    Ok(map)\n}\n"
  },
  {
    "path": "rustfmt.toml",
    "content": "max_width = 79\nuse_small_heuristics = \"max\"\n"
  },
  {
    "path": "src/download.rs",
    "content": "use std::fs::{self, File};\nuse std::io;\nuse std::path::{Path, PathBuf};\n\nuse {anyhow::Context, flate2::read::GzDecoder};\n\n/// The base URL to the IMDb data set.\n///\n/// It's not clear if this URL will remain free and open forever, although it\n/// is provided by IMDb proper. If this goes away, we'll need to switch to s3.\nconst IMDB_BASE_URL: &'static str = \"https://datasets.imdbws.com\";\n\n/// All of the data sets we care about.\n///\n/// We leave out cast/crew because we don't need them for renaming files.\nconst DATA_SETS: &'static [&'static str] = &[\n    \"title.akas.tsv.gz\",\n    \"title.basics.tsv.gz\",\n    \"title.episode.tsv.gz\",\n    \"title.ratings.tsv.gz\",\n];\n\n/// Download ensures that all of the IMDb data files exist and have non-zero\n/// size in the given directory. Any path that does not meet these criteria\n/// is fetched from IMDb. Other paths are left untouched.\n///\n/// Returns true if and only if at least one file was downloaded.\npub fn download_all<P: AsRef<Path>>(dir: P) -> anyhow::Result<bool> {\n    let dir = dir.as_ref();\n    fs::create_dir_all(dir)?;\n\n    let nonexistent = non_existent_data_sets(dir)?;\n    for dataset in &nonexistent {\n        download_one(dir, dataset)?;\n    }\n    Ok(nonexistent.len() > 0)\n}\n\n/// Update will update all data set files, regardless of whether they already\n/// exist or not.\npub fn update_all<P: AsRef<Path>>(dir: P) -> anyhow::Result<()> {\n    let dir = dir.as_ref();\n    fs::create_dir_all(dir)?;\n\n    for dataset in DATA_SETS {\n        download_one(dir, dataset)?;\n    }\n    Ok(())\n}\n\n/// Downloads a single data set, decompresses it and writes it to the\n/// corresponding file path in the given directory.\nfn download_one(outdir: &Path, dataset: &'static str) -> anyhow::Result<()> {\n    let outpath = dataset_path(outdir, dataset);\n    let mut outfile = File::create(&outpath)?;\n\n    let url = format!(\"{}/{}\", IMDB_BASE_URL, dataset);\n    
log::info!(\"downloading {} to {}\", url, outpath.display());\n    let resp = ureq::get(&url).call().context(\"HTTP error\")?;\n    log::info!(\"sorting CSV records\");\n    write_sorted_csv_records(\n        GzDecoder::new(resp.into_reader()),\n        &mut outfile,\n    )?;\n    Ok(())\n}\n\n/// Gets a list of data sets that either don't exist in the current directory\n/// or have zero size.\nfn non_existent_data_sets(dir: &Path) -> anyhow::Result<Vec<&'static str>> {\n    let mut result = vec![];\n    for &dataset in DATA_SETS {\n        let path = dataset_path(dir, dataset);\n        if fs::metadata(path).map(|md| md.len() == 0).unwrap_or(true) {\n            result.push(dataset);\n        }\n    }\n    Ok(result)\n}\n\n/// Build the path on disk for a dataset, given the directory and the dataset\n/// name.\nfn dataset_path(dir: &Path, name: &'static str) -> PathBuf {\n    let mut path = dir.join(name);\n    // We drop the gz extension since we decompress before writing to disk.\n    path.set_extension(\"\");\n    path\n}\n\n/// Read all CSV data into memory and sort the records in lexicographic order.\n///\n/// This is unfortunately necessary because the IMDb data is no longer sorted\n/// in lexicographic order with respect to the `tt` identifiers. 
This appears\n/// to be fallout as a result of adding 10 character identifiers (previously,\n/// only 9 character identifiers were used).\nfn write_sorted_csv_records<R: io::Read, W: io::Write>(\n    rdr: R,\n    wtr: W,\n) -> anyhow::Result<()> {\n    use bstr::{io::BufReadExt, ByteSlice};\n    use std::io::Write;\n\n    // We actually only sort the raw lines here instead of parsing CSV records,\n    // since parsing into CSV records has fairly substantial memory overhead.\n    // Since IMDb CSV data never contains a record that spans multiple lines,\n    // this transformation is okay.\n    let rdr = io::BufReader::new(rdr);\n    let mut lines = rdr.byte_lines().collect::<io::Result<Vec<_>>>()?;\n    if lines.is_empty() {\n        anyhow::bail!(\"got empty CSV input\");\n    }\n    // Keep the header record first.\n    lines[1..].sort_unstable();\n\n    let mut wtr = io::BufWriter::new(wtr);\n    let mut prev = None;\n    for (i, line) in lines.iter().enumerate() {\n        // *sigh* ... Looks like the data downloaded is corrupt sometimes,\n        // where there are duplicate rows.\n        let first = match line.split_str(\"\\t\").next() {\n            Some(first) => first,\n            None => anyhow::bail!(\n                \"expected to find one tab-delimited field in '{:?}'\",\n                line.as_bstr(),\n            ),\n        };\n        if i > 0 && prev == Some(first) {\n            continue;\n        }\n        prev = Some(first);\n        wtr.write_all(&line)?;\n        wtr.write_all(b\"\\n\")?;\n    }\n    wtr.flush()?;\n    Ok(())\n}\n"
  },
  {
    "path": "src/logger.rs",
    "content": "// This module defines a super simple logger that works with the `log` crate.\n// We don't need anything fancy; just basic log levels and the ability to\n// print to stderr. We therefore avoid bringing in extra dependencies just\n// for this functionality.\n\nuse log::Log;\n\n/// Initialize a simple logger.\npub fn init() -> anyhow::Result<()> {\n    Ok(Logger::init()?)\n}\n\n/// The simplest possible logger that logs to stderr.\n///\n/// This logger does no filtering. Instead, it relies on the `log` crates\n/// filtering via its global max_level setting.\n#[derive(Debug)]\nstruct Logger(());\n\nconst LOGGER: &'static Logger = &Logger(());\n\nimpl Logger {\n    /// Create a new logger that logs to stderr and initialize it as the\n    /// global logger. If there was a problem setting the logger, then an\n    /// error is returned.\n    fn init() -> std::result::Result<(), log::SetLoggerError> {\n        log::set_logger(LOGGER)\n    }\n}\n\nimpl Log for Logger {\n    fn enabled(&self, _: &log::Metadata) -> bool {\n        // We set the log level via log::set_max_level, so we don't need to\n        // implement filtering here.\n        true\n    }\n\n    fn log(&self, record: &log::Record) {\n        if !should_log(record) {\n            return;\n        }\n        eprintln!(\"{}: {}\", record.level(), record.args());\n    }\n\n    fn flush(&self) {\n        // We use eprintln! which is flushed on every call.\n    }\n}\n\nfn should_log(record: &log::Record) -> bool {\n    let t = record.target();\n    t.starts_with(\"imdb_rename\") || t.starts_with(\"imdb_index\")\n}\n"
  },
  {
    "path": "src/main.rs",
    "content": "use std::env;\nuse std::ffi::OsStr;\nuse std::io::{self, Write};\nuse std::path::PathBuf;\nuse std::process;\n\nuse imdb_index::{Index, IndexBuilder, NgramType, Searcher};\nuse lazy_static::lazy_static;\nuse tabwriter::TabWriter;\nuse walkdir::WalkDir;\n\nuse crate::rename::{RenameAction, RenamerBuilder};\nuse crate::util::{choose, read_yesno, write_tsv};\n\nmod download;\nmod logger;\nmod rename;\nmod util;\n\nfn main() {\n    if let Err(err) = try_main() {\n        // A pipe error occurs when the consumer of this process's output has\n        // hung up. This is a normal event, and we should quit gracefully.\n        if is_pipe_error(&err) {\n            process::exit(0);\n        }\n        eprintln!(\"{:?}\", err);\n        process::exit(1);\n    }\n}\n\nfn try_main() -> anyhow::Result<()> {\n    logger::init()?;\n    log::set_max_level(log::LevelFilter::Info);\n\n    let args = Args::from_matches(&app().get_matches())?;\n    if args.debug {\n        log::set_max_level(log::LevelFilter::Debug);\n    }\n\n    // Forcefully update the data and re-index if requested.\n    if args.update_data {\n        args.download_all_update()?;\n        args.create_index()?;\n        return Ok(());\n    }\n    // Ensure that the necessary data exists.\n    if args.download_all()? 
|| args.update_index {\n        args.create_index()?;\n        if args.update_index {\n            return Ok(());\n        }\n    }\n    // Now ensure that the index exists.\n    if !args.index_dir.exists() {\n        args.create_index()?;\n    }\n\n    let mut searcher = args.searcher()?;\n    let results = match args.query {\n        None => None,\n        Some(ref query) => Some(searcher.search(&query.parse()?)?),\n    };\n    if args.files.is_empty() {\n        let results = match results {\n            None => anyhow::bail!(\"run with a file to rename or --query\"),\n            Some(ref results) => results,\n        };\n        return write_tsv(io::stdout(), &mut searcher, results.as_slice());\n    }\n\n    let mut builder = RenamerBuilder::new();\n    builder\n        .min_votes(args.min_votes)\n        .good_threshold(0.25)\n        .regex_episode(&args.regex_episode)\n        .regex_season(&args.regex_season)\n        .regex_year(&args.regex_year);\n    if let Some(ref results) = results {\n        builder.force(choose(&mut searcher, results.as_slice(), 0.25)?);\n    }\n    let renamer = builder.build()?;\n    let proposals = renamer.propose(\n        &mut searcher,\n        &args.files,\n        args.dest_dir,\n        args.rename_action,\n    )?;\n    if proposals.is_empty() {\n        anyhow::bail!(\"no files to rename\");\n    }\n\n    let mut stdout = TabWriter::new(io::stdout());\n    for p in &proposals {\n        writeln!(stdout, \"{}\\t->\\t{}\", p.src().display(), p.dst().display())?;\n    }\n    stdout.flush()?;\n\n    if read_yesno(&format!(\n        \"Are you sure you want to {action} the above files? (y/n) \",\n        action = &args.rename_action\n    ))? 
{\n        for p in &proposals {\n            if let Err(err) = p.rename() {\n                eprintln!(\"{}\", err);\n            }\n        }\n    }\n    Ok(())\n}\n\n#[derive(Debug)]\nstruct Args {\n    data_dir: PathBuf,\n    dest_dir: Option<PathBuf>,\n    debug: bool,\n    files: Vec<PathBuf>,\n    index_dir: PathBuf,\n    ngram_size: usize,\n    ngram_type: NgramType,\n    query: Option<String>,\n    regex_episode: String,\n    regex_season: String,\n    regex_year: String,\n    update_data: bool,\n    update_index: bool,\n    min_votes: u32,\n    rename_action: RenameAction,\n}\n\nimpl Args {\n    fn from_matches(matches: &clap::ArgMatches) -> anyhow::Result<Args> {\n        let files = collect_paths(\n            matches\n                .values_of_os(\"file\")\n                .map(|it| it.collect())\n                .unwrap_or(vec![]),\n            matches.is_present(\"follow\"),\n        );\n        let query = matches.value_of_lossy(\"query\").map(|q| q.into_owned());\n        let data_dir =\n            matches.value_of_os(\"data-dir\").map(PathBuf::from).unwrap();\n        let dest_dir = matches.value_of_os(\"dest-dir\").map(PathBuf::from);\n        let index_dir = matches\n            .value_of_os(\"index-dir\")\n            .map(PathBuf::from)\n            .unwrap_or(data_dir.join(\"index\"));\n        let regex_episode =\n            matches.value_of_lossy(\"re-episode\").unwrap().into_owned();\n        let regex_season =\n            matches.value_of_lossy(\"re-season\").unwrap().into_owned();\n        let regex_year =\n            matches.value_of_lossy(\"re-year\").unwrap().into_owned();\n        let min_votes = matches.value_of_lossy(\"votes\").unwrap().parse()?;\n        let rename_action = {\n            if matches.is_present(\"symlink\") {\n                if !cfg!(unix) {\n                    anyhow::bail!(\n                        \"--symlink currently supported only on Unix \\\n                         platforms, try hardlink (-H) 
instead\"\n                    );\n                }\n                RenameAction::Symlink\n            } else if matches.is_present(\"hardlink\") {\n                RenameAction::Hardlink\n            } else {\n                RenameAction::Rename\n            }\n        };\n        Ok(Args {\n            data_dir: data_dir,\n            dest_dir: dest_dir,\n            debug: matches.is_present(\"debug\"),\n            files: files,\n            index_dir: index_dir,\n            ngram_size: matches\n                .value_of_lossy(\"ngram-size\")\n                .unwrap()\n                .parse()?,\n            ngram_type: matches\n                .value_of_lossy(\"ngram-type\")\n                .unwrap()\n                .parse()?,\n            query: query,\n            regex_episode: regex_episode,\n            regex_season: regex_season,\n            regex_year: regex_year,\n            update_data: matches.is_present(\"update-data\"),\n            update_index: matches.is_present(\"update-index\"),\n            min_votes: min_votes,\n            rename_action: rename_action,\n        })\n    }\n\n    fn create_index(&self) -> anyhow::Result<Index> {\n        Ok(IndexBuilder::new()\n            .ngram_size(self.ngram_size)\n            .ngram_type(self.ngram_type)\n            .create(&self.data_dir, &self.index_dir)?)\n    }\n\n    fn open_index(&self) -> anyhow::Result<Index> {\n        Ok(Index::open(&self.data_dir, &self.index_dir)?)\n    }\n\n    fn searcher(&self) -> anyhow::Result<Searcher> {\n        Ok(Searcher::new(self.open_index()?))\n    }\n\n    fn download_all(&self) -> anyhow::Result<bool> {\n        download::download_all(&self.data_dir)\n    }\n\n    fn download_all_update(&self) -> anyhow::Result<()> {\n        download::update_all(&self.data_dir)\n    }\n}\n\nfn app() -> clap::App<'static, 'static> {\n    use clap::{App, AppSettings, Arg};\n\n    lazy_static! 
{\n        // clap wants all of its strings tied to a particular lifetime, but\n        // we'd really like to determine some default values dynamically. Using\n        // a lazy_static here is one way of safely giving a static lifetime to\n        // a value that is computed at runtime.\n        //\n        // An alternative approach would be to compute all of our default\n        // values in the caller, and pass them into this function. It's nicer\n        // to defined what we need here though. Locality of reference and all\n        // that.\n        static ref DATA_DIR: PathBuf = env::temp_dir().join(\"imdb-rename\");\n    }\n\n    App::new(\"imdb-rename\")\n        .author(clap::crate_authors!())\n        .version(clap::crate_version!())\n        .max_term_width(100)\n        .setting(AppSettings::UnifiedHelpMessage)\n        .arg(Arg::with_name(\"file\")\n             .multiple(true)\n             .help(\"One or more files to rename.\"))\n        .arg(Arg::with_name(\"data-dir\")\n             .long(\"data-dir\")\n             .env(\"IMDB_RENAME_DATA_DIR\")\n             .takes_value(true)\n             .default_value_os(DATA_DIR.as_os_str())\n             .help(\"The location to store IMDb data files.\"))\n        .arg(Arg::with_name(\"dest-dir\")\n             .long(\"dest-dir\")\n             .short(\"d\")\n             .env(\"IMDB_RENAME_DEST_DIR\")\n             .takes_value(true)\n             .help(\"The output directory of renamed files \\\n                    (or symlinks/hardlinks with the -s/-H options). \\\n                    By default, files are renamed in place.\"))\n        .arg(Arg::with_name(\"debug\")\n             .long(\"debug\")\n             .help(\"Show debug messages. 
Use this when filing bugs.\"))\n        .arg(Arg::with_name(\"follow\")\n             .long(\"follow\")\n             .short(\"f\")\n             .help(\"Follow directories and attempt to rename all child \\\n                    entries.\"))\n        .arg(Arg::with_name(\"index-dir\")\n             .long(\"index-dir\")\n             .env(\"IMDB_RENAME_INDEX_DIR\")\n             .takes_value(true)\n             .help(\"The location to store IMDb index files. \\\n                    When absent, the default is {data-dir}/index.\"))\n        .arg(Arg::with_name(\"ngram-size\")\n             .long(\"ngram-size\")\n             .default_value(\"3\")\n             .help(\"Choose the ngram size for indexing names. This is only \\\n                    used at index time and otherwise ignored.\"))\n        .arg(Arg::with_name(\"ngram-type\")\n             .long(\"ngram-type\")\n             .default_value(\"window\")\n             .possible_values(NgramType::possible_names())\n             .help(\"Choose the type of ngram generation. This is only used \\\n                    used at index time and otherwise ignored.\"))\n        .arg(Arg::with_name(\"query\")\n             .long(\"query\")\n             .short(\"q\")\n             .takes_value(true)\n             .help(\"Setting an override query is necessary if the file \\\n                    path lacks sufficient information to find a matching \\\n                    title. For example, if a year could not be found. It \\\n                    is also useful for specifying a TV show when renaming \\\n                    multiple episodes at once.\"))\n        .arg(Arg::with_name(\"re-episode\")\n             .long(\"re-episode\")\n             .takes_value(true)\n             .default_value(r\"[Ee](?P<episode>[0-9]+)\")\n             .help(\"A regex for matching episode numbers. 
The episode number \\\n                    is extracted by looking for a 'episode' capture group.\"))\n        .arg(Arg::with_name(\"re-season\")\n             .long(\"re-season\")\n             .takes_value(true)\n             .default_value(r\"[Ss](?P<season>[0-9]+)\")\n             .help(\"A regex for matching season numbers. The season number \\\n                    is extracted by looking for a 'season' capture group.\"))\n        .arg(Arg::with_name(\"re-year\")\n             .long(\"re-year\")\n             .takes_value(true)\n             .default_value(r\"\\b(?P<year>[0-9]{4})\\b\")\n             .help(\"A regex for matching the year. The year is extracted by \\\n                    looking for a 'year' capture group.\"))\n        .arg(Arg::with_name(\"update-data\")\n             .long(\"update-data\")\n             .help(\"Forcefully refreshes the IMDb data and then exits.\"))\n        .arg(Arg::with_name(\"votes\")\n             .long(\"votes\")\n             .default_value(\"1000\")\n             .help(\"The minimum number of votes required for results matching \\\n                    a query derived from existing file names. This is not \\\n                    applied to explicit queries via the -q/--query flag.\"))\n        .arg(Arg::with_name(\"update-index\")\n             .long(\"update-index\")\n             .help(\"Forcefully re-indexes the IMDb data and then exits.\"))\n        .arg(Arg::with_name(\"symlink\")\n             .long(\"symlink\")\n             .short(\"s\")\n             .conflicts_with(\"hardlink\")\n             .help(\"Create a symlink instead of renaming. \\\n                    (Unix only feature.)\"))\n        .arg(Arg::with_name(\"hardlink\")\n             .long(\"hardlink\")\n             .short(\"H\")\n             .conflicts_with(\"symlink\")\n             .help(\"Create a hardlink instead of renaming. 
\\\n                    This doesn't work when renaming directories.\"))\n}\n\n/// Collect all file paths from a sequence of OsStrings from the command line.\n/// If `follow` is true, then any paths that are directories are expanded to\n/// include all child paths, recursively.\n///\n/// If there is an error following a path, then it is logged to stderr and\n/// otherwise skipped.\nfn collect_paths(paths: Vec<&OsStr>, follow: bool) -> Vec<PathBuf> {\n    let mut results = vec![];\n    for path in paths {\n        let path = PathBuf::from(path);\n        if !follow || !path.is_dir() {\n            results.push(path);\n            continue;\n        }\n        for result in WalkDir::new(path) {\n            match result {\n                Ok(dent) => results.push(dent.path().to_path_buf()),\n                Err(err) => eprintln!(\"{}\", err),\n            }\n        }\n    }\n    results\n}\n\n/// Return true if and only if an I/O broken pipe error exists in the causal\n/// chain of the given error.\nfn is_pipe_error(err: &anyhow::Error) -> bool {\n    for cause in err.chain() {\n        if let Some(ioerr) = cause.downcast_ref::<io::Error>() {\n            if ioerr.kind() == io::ErrorKind::BrokenPipe {\n                return true;\n            }\n        }\n    }\n    false\n}\n"
  },
  {
    "path": "src/rename.rs",
    "content": "use std::collections::{HashMap, HashSet};\nuse std::fmt;\nuse std::fs;\nuse std::path::{Path, PathBuf};\nuse std::sync::Mutex;\n\nuse imdb_index::{MediaEntity, Query, SearchResults, Searcher, TitleKind};\nuse lazy_static::lazy_static;\nuse regex::Regex;\n\nuse crate::util::choose;\n\n/// A proposal to rename a `src` file path to a `dst` file path.\n#[derive(Clone, Debug)]\npub struct RenameProposal {\n    src: PathBuf,\n    dst: PathBuf,\n    action: RenameAction,\n}\n\n/// The action to take when renaming a file.\n#[derive(Copy, Clone, Debug, PartialEq)]\npub enum RenameAction {\n    /// This does a simple rename of the file.\n    Rename,\n    /// This creates a symlink to the given file.\n    Symlink,\n    /// This creates a hardlink to the given file.\n    Hardlink,\n}\n\nimpl fmt::Display for RenameAction {\n    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {\n        match self {\n            RenameAction::Rename => \"rename\",\n            RenameAction::Symlink => \"symlink\",\n            RenameAction::Hardlink => \"hardlink\",\n        }\n        .fmt(f)\n    }\n}\n\nimpl RenameAction {\n    fn is_link(&self) -> bool {\n        match *self {\n            RenameAction::Rename => false,\n            RenameAction::Symlink | RenameAction::Hardlink => true,\n        }\n    }\n}\n\nimpl RenameProposal {\n    /// Create a new proposal with the given source and destination. The\n    /// destination is constructed by joining `dst_parent` with `dst_name`.\n    /// `dst_name` is sanitized to be safe as a file name.\n    ///\n    /// The given action determines whether to rename the source to the\n    /// destination, create a symlink or create a hardlink.\n    fn new(\n        src: PathBuf,\n        dst_parent: &Path,\n        dst_name: &str,\n        action: RenameAction,\n    ) -> RenameProposal {\n        lazy_static! 
{\n            static ref RE_BAD_PATH_CHARS: Regex =\n                Regex::new(r\"[\\x00/]\",).unwrap();\n        }\n        let name = RE_BAD_PATH_CHARS.replace_all(dst_name, \"_\");\n\n        RenameProposal { src, dst: dst_parent.join(&*name), action }\n    }\n\n    /// Execute this proposal according to `RenameAction`.\n    pub fn rename(&self) -> anyhow::Result<()> {\n        match self.action {\n            RenameAction::Rename => {\n                fs::rename(&self.src, &self.dst).map_err(|e| {\n                    anyhow::anyhow!(\n                        \"error renaming '{}' to '{}': {}\",\n                        self.src.display(),\n                        self.dst.display(),\n                        e,\n                    )\n                })?;\n            }\n            #[cfg(not(unix))]\n            RenameAction::Symlink => {\n                anyhow::bail!(\"symlinks are only supported for Unix\")\n            }\n            #[cfg(unix)]\n            RenameAction::Symlink => {\n                use std::os::unix;\n\n                unix::fs::symlink(&self.src, &self.dst).map_err(|e| {\n                    anyhow::anyhow!(\n                        \"error symlinking '{}' to '{}': {}\",\n                        self.src.display(),\n                        self.dst.display(),\n                        e,\n                    )\n                })?;\n            }\n            RenameAction::Hardlink => {\n                fs::hard_link(&self.src, &self.dst).map_err(|e| {\n                    anyhow::anyhow!(\n                        \"error hardlinking '{}' to '{}': {}\",\n                        self.src.display(),\n                        self.dst.display(),\n                        e,\n                    )\n                })?;\n            }\n        }\n        Ok(())\n    }\n\n    /// The `src` of this proposal.\n    pub fn src(&self) -> &Path {\n        &self.src\n    }\n\n    /// The `dst` of this proposal.\n    ///\n    /// Note that the 
destination is cleansed such that it is safe for\n    /// renaming. e.g., If a `/` occurs in an IMDb title, then it is replaced\n    /// with `_`.\n    pub fn dst(&self) -> &Path {\n        &self.dst\n    }\n}\n\n/// A renamer generates file rename proposals based on IMDb.\n///\n/// Fundamentally, a renamer is an entity linker, which attempts to connect\n/// file paths on your system that follow a prescribed pattern with canonical\n/// entity entries in IMDb.\n///\n/// A renamer can be built via a `RenamerBuilder`, and proposals can be\n/// generated via the `propose` method on `Renamer`. A `Renamer` itself never\n/// touches the file system.\n#[derive(Debug)]\npub struct Renamer {\n    cache: Mutex<HashMap<Query, SearchResults<MediaEntity>>>,\n    choose_cache: Mutex<HashMap<Query, MediaEntity>>,\n    force: Option<MediaEntity>,\n    min_votes: u32,\n    good_threshold: f64,\n    episode: Regex,\n    season: Regex,\n    year: Regex,\n}\n\nimpl Renamer {\n    /// Propose a set of renames, where each proposal proposes to rename a\n    /// path in the slice given to a new path using its proper title according\n    /// to IMDb. This never executes any changes to the file system.\n    ///\n    /// This returns an error if any two of the proposals recommend an exactly\n    /// equivalent destination path. An error is also returned if a destination\n    /// path already exists. Finally, the proposals are sorted in descending\n    /// order of path length if any one of them is a directory, which should\n    /// permit changing entries in a directory and a directory itself in one\n    /// go.\n    ///\n    /// An optional destination can be given, which when present, is used as\n    /// the directory in which renames/links are created. 
Similarly, the action\n    /// given specifies whether the proposal should rename a file, symlink to\n    /// it or hardlink to it.\n    ///\n    /// Note that this may log some types of errors to stderr but otherwise\n    /// continue, which means that the set of proposals returned may not cover\n    /// all paths given. Errors resulting from reading the index will cause an\n    /// error to be returned.\n    pub fn propose(\n        &self,\n        searcher: &mut Searcher,\n        paths: &[PathBuf],\n        dest: Option<PathBuf>,\n        action: RenameAction,\n    ) -> anyhow::Result<Vec<RenameProposal>> {\n        let mut proposals = vec![];\n        for path in paths {\n            let result =\n                self.propose_one(searcher, path, dest.as_deref(), action);\n            let proposal = match result {\n                None => continue,\n                Some(proposal) => proposal,\n            };\n            // If there's no change, then skip it.\n            if proposal.src == proposal.dst {\n                continue;\n            }\n            proposals.push(proposal);\n        }\n\n        // Check that we have no destination duplicates. If we permit them,\n        // then it would be pretty easy to clobber the user's data. That's bad.\n        //\n        // We also make sure that the destination doesn't already exist. 
This\n        // isn't atomic, but it's probably a fine approximation.\n        let mut seen = HashSet::new();\n        let mut any_dir = false;\n        for p in &proposals {\n            if seen.contains(&p.dst) {\n                anyhow::bail!(\n                    \"duplicate rename proposal for '{}'\",\n                    p.dst.display()\n                );\n            }\n            seen.insert(p.dst.clone());\n            if p.dst.exists() {\n                anyhow::bail!(\n                    \"file path '{}' already exists\",\n                    p.dst.display()\n                );\n            }\n            any_dir = any_dir || p.src.is_dir();\n        }\n        // Finally, sort the proposals such that the longest ones come first.\n        // This should cause child entries to get renamed before parent\n        // entries.\n        if any_dir {\n            proposals.sort_by(|p1, p2| {\n                let (p1, p2) = (p1.dst.as_os_str(), p2.dst.as_os_str());\n                p1.len().cmp(&p2.len()).reverse()\n            });\n        }\n        Ok(proposals)\n    }\n\n    /// Propose a single rename for the given path.\n    ///\n    /// If an error occurs while searching, or if searching yields no results,\n    /// or if an unexpected condition was hit, then an error is logged to\n    /// stderr and `None` is returned.\n    fn propose_one(\n        &self,\n        searcher: &mut Searcher,\n        path: &Path,\n        dest: Option<&Path>,\n        action: RenameAction,\n    ) -> Option<RenameProposal> {\n        let candidate = match self.candidate(path) {\n            Ok(candidate) => candidate,\n            Err(err) => {\n                eprintln!(\"[skipping] could not parse file path: {}\", err);\n                return None;\n            }\n        };\n        let result = match candidate.kind {\n            CandidateKind::Any(ref x) => self.find_any(searcher, x),\n            CandidateKind::Episode(ref x) => self.find_episode(searcher, x),\n    
        CandidateKind::Unknown => self.find_unknown(),\n        };\n        let ent = match result {\n            Ok(ent) => ent,\n            Err(err) => {\n                eprintln!(\n                    \"[skipping] error searching for {}: {}\",\n                    path.display(),\n                    err,\n                );\n                return None;\n            }\n        };\n\n        // Setup our sources and destinations. They get tweaked depending on\n        // what our rename action is and whether a destination directory was\n        // explicitly given.\n        let dest_name = candidate.path.imdb_name(&ent);\n        let mut src_path = path.to_path_buf();\n        let mut dest_parent_dir =\n            dest.map(|d| d.to_path_buf()).unwrap_or(candidate.path.parent);\n\n        // A symlink was requested to be created in a destination presumably\n        // different than the current directory. This means that the file\n        // specified on the commandline will need to be an absolute path,\n        // otherwise the symlink will not point to the correct place.\n        if dest.is_some() && action == RenameAction::Symlink {\n            src_path = match src_path.canonicalize() {\n                Ok(src_path) => src_path,\n                Err(err) => {\n                    eprintln!(\n                        \"[skipping] error making {} an absolute path: {}\",\n                        src_path.display(),\n                        err,\n                    );\n                    return None;\n                }\n            };\n        }\n        // A symlink or hardlink was requested to be created without a\n        // destination specified. 
In this case, it only makes sense to place\n        // the symlink in the current directory being executed from, otherwise\n        // potentially relative file paths won't match up.\n        if dest.is_none() && action.is_link() {\n            dest_parent_dir = match std::env::current_dir() {\n                Ok(cwd) => cwd,\n                Err(err) => {\n                    eprintln!(\n                        \"[skipping] error getting current directory: {}\",\n                        err,\n                    );\n                    return None;\n                }\n            };\n        }\n        Some(RenameProposal::new(\n            src_path,\n            &dest_parent_dir,\n            &dest_name,\n            action,\n        ))\n    }\n\n    /// Search for any entity via its name and a year. In general, this is\n    /// enough information to narrow down the results considerably for most\n    /// movies.\n    ///\n    /// If an entity override is provided, then that is returned instead.\n    fn find_any(\n        &self,\n        searcher: &mut Searcher,\n        candidate: &CandidateAny,\n    ) -> anyhow::Result<MediaEntity> {\n        // If we already have an entity override, then just use that to build\n        // the proposal and skip any automatic searches.\n        if let Some(ref ent) = self.force {\n            return Ok(ent.clone());\n        }\n\n        // Otherwise, try to figure out the \"right\" name by constructing a\n        // query from the candidate and searching IMDb.\n        let query = self\n            .name_query(&candidate.title)\n            .year_ge(candidate.year)\n            .year_le(candidate.year)\n            // Basically include every kind except for episode and video games.\n            // This helps filter out a lot of noise.\n            .kind(TitleKind::Movie)\n            .kind(TitleKind::Short)\n            .kind(TitleKind::TVMiniSeries)\n            .kind(TitleKind::TVMovie)\n            
.kind(TitleKind::TVSeries)\n            .kind(TitleKind::TVShort)\n            .kind(TitleKind::TVSpecial)\n            .kind(TitleKind::Video)\n            .votes_ge(self.min_votes);\n        log::debug!(\"automatic 'any' query: {:?}\", query);\n        self.choose_one(searcher, &query)\n    }\n\n    /// Search for the episode entity corresponding to the episode information\n    /// in the given candidate. If one couldn't be found, then an error is\n    /// returned.\n    ///\n    /// This works by assuming the candidate episode's name is actually the\n    /// TV show name. So we first look for the TV show entity, and then use\n    /// that to find the corresponding episode.\n    fn find_episode(\n        &self,\n        searcher: &mut Searcher,\n        candidate: &CandidateEpisode,\n    ) -> anyhow::Result<MediaEntity> {\n        let tvshow = self.find_tvshow_for_episode(searcher, candidate)?;\n        let eps =\n            searcher.index().episodes(&tvshow.title().id, candidate.season)?;\n        let ep = match eps\n            .into_iter()\n            .find(|ep| ep.episode == Some(candidate.episode))\n        {\n            Some(ep) => ep,\n            None => anyhow::bail!(\n                \"could not find S{:02}E{:02} for TV show {}\",\n                candidate.season,\n                candidate.episode,\n                tvshow.title().id,\n            ),\n        };\n        match searcher.index().entity(&ep.id)? {\n            Some(ent) => Ok(ent),\n            None => anyhow::bail!(\n                \"could not find media entity for episode {}\",\n                ep.id\n            ),\n        }\n    }\n\n    /// Search for the TV show entity corresponding to the episode information\n    /// in the given candidate. If one couldn't be found, then an error is\n    /// returned.\n    ///\n    /// If there is an entity override, then it is used instead. 
If the\n    /// override isn't a TV show, then an error is returned.\n    fn find_tvshow_for_episode(\n        &self,\n        searcher: &mut Searcher,\n        candidate: &CandidateEpisode,\n    ) -> anyhow::Result<MediaEntity> {\n        // If we already have an entity override, then just use that as the\n        // TV show. If it isn't a TV show, then return an error.\n        if let Some(ref ent) = self.force {\n            if !ent.title().kind.is_tv_series() {\n                anyhow::bail!(\n                    \"expected TV show to rename episode, but found {}\",\n                    ent.title().kind\n                );\n            }\n            return Ok(ent.clone());\n        }\n\n        // Otherwise, try to figure out the \"right\" TV show by constructing a\n        // query from the candidate and searching IMDb.\n        let query = self\n            .name_query(&candidate.tvshow_title)\n            .kind(TitleKind::TVMiniSeries)\n            .kind(TitleKind::TVSeries)\n            .votes_ge(self.min_votes);\n        log::debug!(\"automatic 'tvshow for episode' query: {:?}\", query);\n        self.choose_one(searcher, &query)\n    }\n\n    /// Return an entity for a completely unknown candidate.\n    ///\n    /// This is invariant with respect to the source path, since we don't\n    /// really know how to interpret it (and if we did, it shouldn't be\n    /// unknown). Therefore, we always defer to the explicit override. 
If there\n    /// is no override, then this returns an error.\n    ///\n    /// This is useful for renaming files like 'English.srt', where the path\n    /// doesn't contain any useful information and an override is necessary\n    /// anyway.\n    fn find_unknown(&self) -> anyhow::Result<MediaEntity> {\n        match self.force {\n            Some(ref ent) => Ok(ent.clone()),\n            None => {\n                anyhow::bail!(\n                    \"could not parse file path and there is no override \\\n                       set via -q/--query\"\n                );\n            }\n        }\n    }\n\n    /// Produce a structured candidate for renaming from a source path.\n    ///\n    /// The candidate returned represents a heuristic analysis performed on\n    /// the source path, and in particular, represents what we think the path\n    /// represents. Principally, this consists of three categories: TV episode,\n    /// any named title with a year, and then everything else. The type of\n    /// candidate we have determines how we guess its canonical entry in IMDb.\n    fn candidate(&self, path: &Path) -> anyhow::Result<Candidate> {\n        let cpath = CandidatePath::from_path(path)?;\n        let name = cpath.base_name.clone();\n\n        if let Some(cepisode) = self.episode_parts(&cpath)? 
{\n            return Ok(Candidate {\n                path: cpath,\n                kind: CandidateKind::Episode(cepisode),\n            });\n        }\n\n        let caps_year = match self.year.captures(&name) {\n            None => {\n                return Ok(Candidate {\n                    path: cpath,\n                    kind: CandidateKind::Unknown,\n                })\n            }\n            Some(caps) => caps,\n        };\n        let mat_year = match caps_year.name(\"year\") {\n            None => anyhow::bail!(\"missing 'year' group in: {}\", self.year),\n            Some(mat) => mat,\n        };\n        let year = mat_year.as_str().parse()?;\n        let title = name[..mat_year.start()].to_string();\n        Ok(Candidate {\n            path: cpath,\n            kind: CandidateKind::Any(CandidateAny { title, year }),\n        })\n    }\n\n    /// Part episode information from the given candidate, if it exists.\n    ///\n    /// If a problem occurred (like detecting a match but missing an expected\n    /// capture group name), then an error is returned. 
If no episode info\n    /// could be found, then `None` is returned.\n    fn episode_parts(\n        &self,\n        cpath: &CandidatePath,\n    ) -> anyhow::Result<Option<CandidateEpisode>> {\n        let name = &cpath.base_name;\n        let caps_season = match self.season.captures(name) {\n            None => return Ok(None),\n            Some(caps) => caps,\n        };\n        let caps_episode = match self.episode.captures(name) {\n            None => return Ok(None),\n            Some(caps) => caps,\n        };\n        let mat_season = match caps_season.name(\"season\") {\n            None => {\n                anyhow::bail!(\"missing 'season' group in: {}\", self.season)\n            }\n            Some(mat) => mat,\n        };\n        let mat_episode = match caps_episode.name(\"episode\") {\n            None => {\n                anyhow::bail!(\"missing 'episode' group in: {}\", self.episode)\n            }\n            Some(mat) => mat,\n        };\n\n        let title_end = caps_season.get(0).unwrap().start();\n        Ok(Some(CandidateEpisode {\n            tvshow_title: name[..title_end].to_string(),\n            season: mat_season.as_str().parse()?,\n            episode: mat_episode.as_str().parse()?,\n        }))\n    }\n\n    /// Build a query and seed it with the given name, after sanitizing the\n    /// name.\n    fn name_query(&self, name: &str) -> Query {\n        let name = name.replace(\".\", \" \");\n        let name = name.trim();\n        log::debug!(\"automatic name query: {:?}\", name);\n        Query::new().name(name)\n    }\n\n    /// Execute a search against the given searcher with the given query and\n    /// choose a single result from the search. 
If no obvious single result\n    /// stands out, then prompt the user for an answer.\n    ///\n    /// If the given query has been executed before, then return the cached\n    /// answer.\n    fn choose_one(\n        &self,\n        searcher: &mut Searcher,\n        query: &Query,\n    ) -> anyhow::Result<MediaEntity> {\n        let mut choose_cache = self.choose_cache.lock().unwrap();\n        if let Some(ent) = choose_cache.get(query) {\n            return Ok(ent.clone());\n        }\n        let results = self.search(searcher, query)?;\n        let ent = choose(searcher, results.as_slice(), self.good_threshold)?;\n        choose_cache.insert(query.clone(), ent.clone());\n        Ok(ent)\n    }\n\n    /// Execute a search against the given searcher with the given query.\n    ///\n    /// If this exact query has been previously executed by this renamer, then\n    /// a cache of results is returned.\n    fn search(\n        &self,\n        searcher: &mut Searcher,\n        query: &Query,\n    ) -> anyhow::Result<SearchResults<MediaEntity>> {\n        let mut cache = self.cache.lock().unwrap();\n        if let Some(results) = cache.get(query) {\n            return Ok(results.clone());\n        }\n        let results = searcher.search(query)?;\n        cache.insert(query.clone(), results.clone());\n        Ok(results)\n    }\n}\n\n/// A candidate represents a source file path with additional structured\n/// information that helps us guess what its corresponding canonical IMDb\n/// entity is.\n#[derive(Clone, Debug)]\nstruct Candidate {\n    /// The original path that this candidate was drawn from. 
The path is\n    /// split up into its parent, name and extension components.\n    path: CandidatePath,\n    /// The type of candidate, with potentially additional information\n    /// depending on the type.\n    kind: CandidateKind,\n}\n\n/// A representation of a source path that we'd like to rename.\n///\n/// It is split up into non-overlapping component pieces to make guessing\n/// easier. In particular, the `parent` and `ext` fields generally aren't\n/// involved in the guessing process, but are used for reassembling a final\n/// proposed file path to rename to. In general, only the `base_name` is used\n/// for guessing.\n///\n/// Note that it is not possible to split every possible path into these\n/// component pieces. Generally, such paths aren't readily guessable, so they\n/// are skipped (with an error message logged to stderr).\n#[derive(Clone, Debug)]\nstruct CandidatePath {\n    /// The parent component of the path. e.g., `/foo` in `/foo/bar.mkv`.\n    parent: PathBuf,\n    /// The base name of this path, minus the extension. e.g., `bar` in\n    /// `/foo/bar.mkv`.\n    base_name: String,\n    /// The extension of this path, if it exists, minus the leading `.`.\n    /// e.g., `mkv` in `/foo/bar.mkv`.\n    ext: Option<String>,\n}\n\n/// Type of a candidate, including any additional type-specific information.\n#[derive(Clone, Debug)]\nenum CandidateKind {\n    /// A general description of any candidate, with a minimal requirement:\n    /// the source file path must contain a year.\n    Any(CandidateAny),\n    /// A description of a candidate that we believe to be an episode, which\n    /// includes the TV show name, the season number and the episode number.\n    Episode(CandidateEpisode),\n    /// Anything else. Generally, there's nothing we can assume about this\n    /// type, but if the user specifies an override, then we'll still be able\n    /// to rename it. 
If no override is given, then a candidate with this type\n    /// is skipped.\n    Unknown,\n}\n\n/// A general description of any candidate with a name and a year. The name\n/// is generally assumed to be all the text preceding the year in the base name\n/// of a file path.\n///\n/// When we initiate a guess based on this candidate type, we assume it can\n/// correspond to any entity in IMDb except for TV show episodes.\n#[derive(Clone, Debug)]\nstruct CandidateAny {\n    /// The presumed title.\n    title: String,\n    /// The presumed year.\n    year: u32,\n}\n\n/// A description of a candidate that we believe to be an episode. This means\n/// we have captured what we believe to be the TV show's name, along with the\n/// season and episode numbers. The TV show's name is generally assumed to be\n/// all the text preceding the season number in the base name of a file path.\n#[derive(Clone, Debug)]\nstruct CandidateEpisode {\n    /// The presumed TV show title.\n    tvshow_title: String,\n    /// The season number.\n    season: u32,\n    /// The episode number.\n    episode: u32,\n}\n\nimpl CandidatePath {\n    /// Build a candidate path from a source file path. 
If a path could not\n    /// be built, then an error is returned.\n    fn from_path(path: &Path) -> anyhow::Result<CandidatePath> {\n        let parent = match path.parent() {\n            None => anyhow::bail!(\n                \"{}: has no parent, cannot rename\",\n                path.display()\n            ),\n            Some(parent) => parent.to_path_buf(),\n        };\n        let name_os = match path.file_name() {\n            None => anyhow::bail!(\"{}: missing file name\", path.display()),\n            Some(name_os) => name_os,\n        };\n        let name = match name_os.to_str() {\n            None => anyhow::bail!(\n                \"{}: invalid UTF-8, cannot rename\",\n                path.display()\n            ),\n            Some(name) => name,\n        };\n        let (base_name, ext) = if path.is_dir() {\n            (name.to_string(), None)\n        } else {\n            match name.rfind('.') {\n                None => (name.to_string(), None),\n                Some(i) => {\n                    (name[..i].to_string(), Some(name[i + 1..].to_string()))\n                }\n            }\n        };\n        Ok(CandidatePath { parent, base_name, ext })\n    }\n\n    /// Convert this candidate path to the desired name based on an IMDb\n    /// entity. 
In general, this replaces the `base_name` of this candidate\n    /// with the title found in the given entity.\n    fn imdb_name(&self, ent: &MediaEntity) -> String {\n        let name = match ent.episode() {\n            Some(ep) => format!(\n                \"S{:02}E{:02} - {}\",\n                ep.season.unwrap_or(0),\n                ep.episode.unwrap_or(0),\n                ent.title().title,\n            ),\n            None => match ent.title().start_year {\n                None => ent.title().title.to_string(),\n                Some(year) => format!(\"{} ({})\", ent.title().title, year),\n            },\n        };\n        match self.ext {\n            None => name,\n            Some(ref ext) => format!(\"{}.{}\", name, ext),\n        }\n    }\n}\n\n/// A builder for configuring a renamer.\n#[derive(Clone, Debug)]\npub struct RenamerBuilder {\n    force: Option<MediaEntity>,\n    min_votes: u32,\n    good_threshold: f64,\n    regex_episode: String,\n    regex_season: String,\n    regex_year: String,\n}\n\nimpl RenamerBuilder {\n    /// Create a `RenamerBuilder` with default settings.\n    pub fn new() -> RenamerBuilder {\n        RenamerBuilder {\n            force: None,\n            min_votes: 1000,\n            good_threshold: 0.25,\n            regex_episode: r\"[Ee](?P<episode>[0-9]+)\".into(),\n            regex_season: r\"[Ss](?P<season>[0-9]+)\".into(),\n            regex_year: r\"\\b(?P<year>[0-9]{4})\\b\".into(),\n        }\n    }\n\n    /// Build a `Renamer` from the current configuration.\n    pub fn build(&self) -> anyhow::Result<Renamer> {\n        Ok(Renamer {\n            cache: Mutex::new(HashMap::new()),\n            choose_cache: Mutex::new(HashMap::new()),\n            force: self.force.clone(),\n            min_votes: self.min_votes,\n            good_threshold: self.good_threshold,\n            episode: Regex::new(&self.regex_episode)?,\n            season: Regex::new(&self.regex_season)?,\n            year: 
Regex::new(&self.regex_year)?,\n        })\n    }\n\n    /// Forcefully use the given entity when producing rename proposals.\n    ///\n    /// When an entity is given here, the renamer will never execute automatic\n    /// queries based on the file name. Instead, it will rename every path\n    /// given using this entity.\n    ///\n    /// If a path to be renamed is determined to be a TV episode, then this\n    /// entity is assumed to be the entity corresponding to that episode's\n    /// TV show. Otherwise, an error will be returned.\n    pub fn force(&mut self, entity: MediaEntity) -> &mut RenamerBuilder {\n        self.force = Some(entity);\n        self\n    }\n\n    /// Set the minimum number of votes required for all search results from\n    /// automatic queries. This is used when formulating queries based on file\n    /// names that aren't TV episodes. The purpose of this is to heuristically\n    /// filter out noise from the IMDb data.\n    ///\n    /// When this isn't specified, a non-zero default is used.\n    pub fn min_votes(&mut self, min_votes: u32) -> &mut RenamerBuilder {\n        self.min_votes = min_votes;\n        self\n    }\n\n    /// Sets the \"good\" threshold for auto-selection.\n    ///\n    /// When running queries generated from file paths, it is often the case\n    /// that multiple results will be returned. If the difference in score\n    /// between the first result and second result is greater than or equal\n    /// to this threshold, then the first result will be automatically chosen.\n    /// Otherwise, a prompt will be shown to the end user requesting an\n    /// explicit selection.\n    pub fn good_threshold(&mut self, threshold: f64) -> &mut RenamerBuilder {\n        self.good_threshold = threshold;\n        self\n    }\n\n    /// Set the regex for detecting the episode number from a file path.\n    ///\n    /// Regexes are executed against the base name of a path. 
The episode\n    /// number is extracted via the `episode` named capture group.\n    pub fn regex_episode(&mut self, pattern: &str) -> &mut RenamerBuilder {\n        self.regex_episode = pattern.to_string();\n        self\n    }\n\n    /// Set the regex for detecting the season number from a file path.\n    ///\n    /// Regexes are executed against the base name of a path. The season\n    /// number is extracted via the `season` named capture group.\n    pub fn regex_season(&mut self, pattern: &str) -> &mut RenamerBuilder {\n        self.regex_season = pattern.to_string();\n        self\n    }\n\n    /// Set the regex for detecting the year from a file path.\n    ///\n    /// Regexes are executed against the base name of a path. The year is\n    /// extracted via the `year` named capture group.\n    pub fn regex_year(&mut self, pattern: &str) -> &mut RenamerBuilder {\n        self.regex_year = pattern.to_string();\n        self\n    }\n}\n\nimpl Default for RenamerBuilder {\n    fn default() -> RenamerBuilder {\n        RenamerBuilder::new()\n    }\n}\n"
  },
  {
    "path": "src/util.rs",
    "content": "use std::io::{self, Write};\n\nuse imdb_index::{Episode, MediaEntity, Scored, Searcher, Title};\nuse tabwriter::TabWriter;\n\n/// Make a choice among the search results given.\n///\n/// If there is no clear winner, then a prompt is shown to the end user, where\n/// they must make a selection. If a selection is absent or invalid, then an\n/// error is returned.\n///\n/// The threshold given determines the automatic selection criteria. Namely,\n/// if the difference of scores between the first and second results is\n/// greater than or equal to the given threshold, then the first result is\n/// returned without prompting the end user.\npub fn choose(\n    searcher: &mut Searcher,\n    results: &[Scored<MediaEntity>],\n    good_threshold: f64,\n) -> anyhow::Result<MediaEntity> {\n    if results.is_empty() {\n        anyhow::bail!(\"no search results available for query\");\n    } else if results.len() == 1 {\n        return Ok(results[0].clone().into_value());\n    } else if (results[0].score() - results[1].score()) >= good_threshold {\n        return Ok(results[0].clone().into_value());\n    }\n\n    write_tsv(io::stdout(), searcher, results)?;\n    let choice = read_number(1, results.len())?;\n    Ok(results[choice - 1].clone().into_value())\n}\n\n/// Reads a number from stdin in the given inclusive range.\npub fn read_number(start: usize, end: usize) -> anyhow::Result<usize> {\n    let mut stdout = io::stdout();\n    write!(stdout, \"Please enter your choice [{}-{}]: \", start, end)?;\n    stdout.flush()?;\n\n    let mut response = String::new();\n    io::stdin().read_line(&mut response)?;\n    let choice: usize = response.trim().parse()?;\n    if choice < start || choice > end {\n        anyhow::bail!(\n            \"invalid choice: {} is not in range [{}-{}]\",\n            choice,\n            start,\n            end\n        );\n    }\n    Ok(choice)\n}\n\n/// Reads a yes/no answer from stdin. 
This is flexible and recognizes\n/// y, Y, yes, YES as 'yes' answers. Everything else is recognized as a 'no'\n/// answer.\npub fn read_yesno(msg: &str) -> anyhow::Result<bool> {\n    let mut stdout = io::stdout();\n    write!(stdout, \"{}\", msg)?;\n    stdout.flush()?;\n\n    let mut response = String::new();\n    io::stdin().read_line(&mut response)?;\n    let answer = response.trim().to_lowercase();\n    Ok(answer == \"y\" || answer == \"yes\")\n}\n\n/// Write the given result set to the given writer.\n///\n/// If a result is an episode, then the index given is used to look up relevant\n/// info about its TV show, if one could be found, and include that information\n/// in the output.\npub fn write_tsv<W: io::Write>(\n    wtr: W,\n    searcher: &mut Searcher,\n    results: &[Scored<MediaEntity>],\n) -> anyhow::Result<()> {\n    let mut wtr = TabWriter::new(wtr).minwidth(4);\n    writeln!(wtr, \"#\\tscore\\tid\\tkind\\ttitle\\tyear\\ttv\")?;\n    for (i, sr) in results.iter().enumerate() {\n        let (score, ent) = (sr.score(), sr.value());\n        if let Some(ep) = ent.episode() {\n            match searcher.index().title(&ep.tvshow_id)? 
{\n                None => write_tsv_title(&mut wtr, i + 1, score, ent)?,\n                Some(tvshow) => {\n                    write_tsv_episode(\n                        &mut wtr,\n                        i + 1,\n                        score,\n                        ent,\n                        &tvshow,\n                        ep,\n                    )?;\n                }\n            }\n        } else {\n            write_tsv_title(&mut wtr, i + 1, score, ent)?;\n        }\n    }\n    wtr.flush()?;\n    Ok(())\n}\n\nfn write_tsv_title<W: io::Write>(\n    mut wtr: W,\n    position: usize,\n    score: f64,\n    ent: &MediaEntity,\n) -> anyhow::Result<()> {\n    write!(\n        wtr,\n        \"{}\\t{:0.3}\\t{}\\t{}\\t{}\\t{}\",\n        position,\n        score,\n        ent.title().id,\n        ent.title().kind,\n        ent.title().title,\n        ent.title()\n            .start_year\n            .map(|y| y.to_string())\n            .unwrap_or(\"N/A\".to_string()),\n    )?;\n    write!(wtr, \"\\n\")?;\n    Ok(())\n}\n\nfn write_tsv_episode<W: io::Write>(\n    mut wtr: W,\n    position: usize,\n    score: f64,\n    ent: &MediaEntity,\n    tvshow: &Title,\n    ep: &Episode,\n) -> anyhow::Result<()> {\n    let tvinfo = format!(\n        \"S{:02}E{:02} {}\",\n        ep.season.unwrap_or(0),\n        ep.episode.unwrap_or(0),\n        tvshow.title,\n    );\n    write!(\n        wtr,\n        \"{}\\t{:0.3}\\t{}\\t{}\\t{}\\t{}\\t{}\",\n        position,\n        score,\n        ent.title().id,\n        ent.title().kind,\n        ent.title().title,\n        ent.title()\n            .start_year\n            .map(|y| y.to_string())\n            .unwrap_or(\"N/A\".to_string()),\n        tvinfo,\n    )?;\n    write!(wtr, \"\\n\")?;\n    Ok(())\n}\n"
  }
]