Repository: BurntSushi/imdb-rename Branch: master Commit: f4180e5d89b5 Files: 46 Total size: 289.3 KB Directory structure: gitextract_oovomjyk/ ├── .github/ │ ├── FUNDING.yml │ └── workflows/ │ └── ci.yml ├── .gitignore ├── COPYING ├── Cargo.toml ├── LICENSE-MIT ├── README.md ├── UNLICENSE ├── data/ │ ├── eval/ │ │ └── truth.toml │ └── test/ │ └── small/ │ ├── title.akas.tsv │ ├── title.basics.tsv │ ├── title.episode.tsv │ └── title.ratings.tsv ├── imdb-eval/ │ ├── COPYING │ ├── Cargo.toml │ ├── LICENSE-MIT │ ├── README.md │ ├── UNLICENSE │ └── src/ │ ├── eval.rs │ ├── logger.rs │ └── main.rs ├── imdb-index/ │ ├── COPYING │ ├── Cargo.toml │ ├── LICENSE-MIT │ ├── README.md │ ├── UNLICENSE │ └── src/ │ ├── error.rs │ ├── index/ │ │ ├── aka.rs │ │ ├── episode.rs │ │ ├── id.rs │ │ ├── mod.rs │ │ ├── names.rs │ │ ├── rating.rs │ │ ├── tests.rs │ │ └── writer.rs │ ├── lib.rs │ ├── record.rs │ ├── scored.rs │ ├── search.rs │ └── util.rs ├── rustfmt.toml └── src/ ├── download.rs ├── logger.rs ├── main.rs ├── rename.rs └── util.rs ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/FUNDING.yml ================================================ github: [BurntSushi] ================================================ FILE: .github/workflows/ci.yml ================================================ name: ci on: pull_request: push: branches: - master schedule: - cron: '00 01 * * *' # The section is needed to drop write-all permissions that are granted on # `schedule` event. By specifying any permission explicitly all others are set # to none. By using the principle of least privilege the damage a compromised # workflow can do (because of an injection or compromised third party tool or # action) is restricted. Currently the workflow doesn't need any additional # permission except for pulling the code. Adding labels to issues, commenting # on pull-requests, etc. 
may need additional permissions: # # Syntax for this section: # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#permissions # # Reference for how to assign permissions on a job-by-job basis: # https://docs.github.com/en/actions/using-jobs/assigning-permissions-to-jobs # # Reference for available permissions that we can enable if needed: # https://docs.github.com/en/actions/security-guides/automatic-token-authentication#permissions-for-the-github_token permissions: # to fetch code (actions/checkout) contents: read jobs: test: runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: include: - build: stable os: ubuntu-latest rust: stable - build: beta os: ubuntu-latest rust: beta - build: nightly os: ubuntu-latest rust: nightly - build: macos os: macos-latest rust: stable - build: win-msvc os: windows-latest rust: stable - build: win-gnu os: windows-latest rust: stable-x86_64-gnu env: RUSTFLAGS: -D warnings RUST_BACKTRACE: 1 steps: - name: Checkout repository uses: actions/checkout@v4 - name: Install Rust uses: dtolnay/rust-toolchain@master with: toolchain: ${{ matrix.rust }} - run: cargo build --all --verbose - run: cargo doc --all --verbose - run: cargo test --all --verbose rustfmt: runs-on: ubuntu-latest steps: - name: Checkout repository uses: actions/checkout@v4 - name: Install Rust uses: dtolnay/rust-toolchain@master with: toolchain: stable components: rustfmt - name: Check formatting run: cargo fmt --all --check ================================================ FILE: .gitignore ================================================ /target /imdb-eval/target /imdb-index/target **/*.rs.bk tags /tmp ================================================ FILE: COPYING ================================================ This project is dual-licensed under the Unlicense and MIT licenses. You may use this code under the terms of either license. 
================================================ FILE: Cargo.toml ================================================ [package] name = "imdb-rename" version = "0.1.6" #:version authors = ["Andrew Gallant "] description = """ A command line utility for searching IMDb and renaming your media files. """ documentation = "https://github.com/BurntSushi/imdb-rename" homepage = "https://github.com/BurntSushi/imdb-rename" repository = "https://github.com/BurntSushi/imdb-rename" readme = "README.md" keywords = ["imdb", "movie", "index", "search", "name"] license = "Unlicense/MIT" edition = "2021" [workspace] members = ["imdb-eval", "imdb-index"] [dependencies] anyhow = "1.0.75" bstr = { version = "1.8.0", default-features = false, features = ["std"] } clap = { version = "2.34.0", default-features = false } flate2 = "1.0.28" imdb-index = { version = "0.1.4", path = "imdb-index" } lazy_static = "1.4.0" log = { version = "0.4.20", features = ["std"] } regex = "1.10.2" tabwriter = "1.3.0" ureq = { version = "2.9.1", default-features = false, features = ["tls"] } walkdir = "2.4.0" [profile.release] debug = true ================================================ FILE: LICENSE-MIT ================================================ The MIT License (MIT) Copyright (c) 2015 Andrew Gallant Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ imdb-rename =========== A command line tool to rename media files based on titles from IMDb. imdb-rename downloads the official IMDb data set and creates a local index to use for fast fuzzy searching. [![Linux build status](https://api.travis-ci.org/BurntSushi/imdb-rename.svg)](https://travis-ci.org/BurntSushi/imdb-rename) [![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/imdb-rename?svg=true)](https://ci.appveyor.com/project/BurntSushi/imdb-rename) [![](http://meritbadge.herokuapp.com/imdb-rename)](https://crates.io/crates/imdb-rename) Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org). ### Installation **[Archives of precompiled binaries for imdb-rename are available for Windows, macOS and Linux.](https://github.com/BurntSushi/imdb-rename/releases)** Otherwise, users are expected to compile imdb-rename from source: ``` $ git clone https://github.com/BurntSushi/imdb-rename $ cd imdb-rename $ cargo build --release $ ./target/release/imdb-rename --help ``` Alternatively, if you have [Cargo installed](https://rustup.rs), then you can install imdb-rename directly from [crates.io](https://crates.io): ``` $ cargo install imdb-rename ``` imdb-rename's minimum supported Rust version is **1.28.0**. #### Archlinux An AUR package is available: [imdb-rename](https://aur.archlinux.org/packages/imdb-rename/). 
### Quick example Ever since Season 1 of The Simpsons came out on DVD, I've been collecting them and ripping them on to my hard drive. My process is somewhat manual, but I wind up with a directory that looks like this: ``` S18E01.mkv S18E05.mkv S18E09.mkv S18E13.mkv S18E17.mkv S18E21.mkv S18E02.mkv S18E06.mkv S18E10.mkv S18E14.mkv S18E18.mkv S18E22.mkv S18E03.mkv S18E07.mkv S18E11.mkv S18E15.mkv S18E19.mkv S18E04.mkv S18E08.mkv S18E12.mkv S18E16.mkv S18E20.mkv ``` It would be much nicer if these files had their proper episode titles. imdb-rename can rename these files automatically using episode titles from IMDb: ``` $ imdb-rename -q 'the simpsons {show}' *.mkv ``` This command ran a query with the `-q` flag to identify the TV show, provided the files to rename, and... presto! ``` S18E01 - The Mook, the Chef, the Wife and Her Homer.mkv S18E02 - Jazzy & The Pussycats.mkv S18E03 - Please Homer, Don't Hammer 'Em.mkv S18E04 - Treehouse of Horror XVII.mkv S18E05 - G.I. (Annoyed Grunt).mkv S18E06 - Moe'N'a Lisa.mkv S18E07 - Ice Cream of Margie: With the Light Blue Hair.mkv S18E08 - The Haw-Hawed Couple.mkv S18E09 - Kill Gil, Vol. 1 & 2.mkv S18E10 - The Wife Aquatic.mkv S18E11 - Revenge Is a Dish Best Served Three Times.mkv S18E12 - Little Big Girl.mkv S18E13 - Springfield Up.mkv S18E14 - Yokel Chords.mkv S18E15 - Rome-old and Juli-eh.mkv S18E16 - Homerazzi.mkv S18E17 - Marge Gamer.mkv S18E18 - The Boys of Bummer.mkv S18E19 - Crook and Ladder.mkv S18E20 - Stop or My Dog Will Shoot.mkv S18E21 - 24 Minutes.mkv S18E22 - You Kent Always Say What You Want.mkv ``` ### Fancier example imdb-rename isn't limited to just renaming TV episodes based on season/episode numbers. It can also perform a fuzzy match based on the contents of the file name. 
For example, given this file: ``` Thor.Ragnarok.2017.1080p.WEB-DL.DD5.1.H264-FGT.mkv ``` We can "clean it up" and rename it to a nice title like so: ``` $ imdb-rename Thor.Ragnarok.2017.1080p.WEB-DL.DD5.1.H264-FGT.mkv ``` which gives us: ``` Thor: Ragnarok (2017).mkv ``` ### Freeform searching We can also use imdb-rename to search IMDb, which is the default behavior when a `-q/--query` is provided without any file names: ``` $ imdb-rename -q 'homey loves flanders' # score id kind title year tv 1 1.000 tt0773646 tvEpisode Homer Loves Flanders 1994 S05E16 The Simpsons 2 0.646 tt2101691 tvEpisode Tiny Loves Flowers N/A S02E08 Dinosaur Train 3 0.568 tt3203408 tvEpisode Courtney Loves Love 2014 S01E05 Courtney Loves Dallas 4 0.561 tt1722576 short In Flanders Fields 2010 5 0.561 tt2253780 tvSeries In Vlaamse Velden 2014 6 0.555 tt4528474 video My Lovely Homeland 2011 7 0.551 tt0220646 tvMovie Moll Flanders 1975 [... results truncated ...] ``` Notice that our query had a typo in it. imdb-rename does its best to find the most relevant results. It is also fast. Even though the above query searches through all 6 million names in IMDb, it runs in under 100ms. This is thanks to using an inverted index memory mapped from disk. ### How does it work? imdb-rename works by downloading [approved datasets from IMDb](https://www.imdb.com/interfaces/), and creating an inverted index based on ngrams extracted from the names in IMDb's data. The inverted index provides a quick way to search and rank results using techniques from [information retrieval](https://nlp.stanford.edu/IR-book/) such as [Okapi-BM25](https://en.wikipedia.org/wiki/Okapi_BM25). ### Motivation My motivation for building this tool is somewhat idiosyncratic, but three-fold: 1. I find it very convenient to have a tool to rename media files automatically. imdb-rename is my third iteration on this tool. The first was an unpublished hodge podge of Python scripts and a MySQL database. 
The second was a [Go program with a PostgreSQL database](https://github.com/BurntSushi/goim). The Go program served me well, but IMDb retired their old data format, which required me to build a new tool to adapt. 2. I've been working on a low-level information retrieval library off-and-on for a couple years, and initially built this tool on top of that library as a form of dogfooding. It didn't work out as well as I'd hoped, so I scrapped the generic library and built out a specific solution tailored to IMDb. I'm no longer dogfooding directly, but I've established a useful baseline. 3. I want more people to learn about information retrieval, and I believe this tool can serve to teach others. In particular, imdb-rename is a complete end-to-end information retrieval system that is fast, solves a real problem, is only a few thousand lines of code and comes with a built-in evaluation that is easy to run. This tool is perhaps a bit over engineered, but I had fun with it. Believe it or not, parts of imdb-rename are intentionally simple at the cost of both query speed and size on disk! ### Evaluation It is possible to run an evaluation to compare the various parameters available for searching. The evaluation system is available as a separate tool called imdb-eval, which is included in this repository. To use it, we must first build it: ``` $ git clone https://github.com/BurntSushi/imdb-rename $ cd imdb-rename $ cargo build --release --all $ ./target/release/imdb-eval --help ``` Running an evaluation is simple. We can run an evaluation on all combinations of scorer and similarity function, along with ngram sizes of 3 and 4 like so: (This will use truth data that is built into the `imdb-eval` binary.) ``` $ ./target/release/imdb-eval --ngram-size 3 --ngram-size 4 | tee eval.csv ``` This will output the results of running a search on every item in the truth data. The results include the rank of the expected answer. 
The results can be summarized into a single score called the [Mean Reciprocal Rank](https://en.wikipedia.org/wiki/Mean_reciprocal_rank) (which is itself a specific instance of MAP, or mean average precision) with the `--summarize` flag like so: ``` $ ./target/release/imdb-eval --summarize eval.csv ``` If you have [xsv](https://github.com/BurntSushi/xsv) installed, then the results can be easily sorted and formatted: ``` $ ./target/release/imdb-eval --summarize eval.csv | xsv sort -R -s mrr | xsv table ``` If you want to tweak the truth data, then you might consider starting with the bundled truth data (assuming you're at the root of the imdb-rename repository): ``` $ $EDITOR data/eval/truth.toml $ ./target/release/imdb-eval --ngram-size 3 --ngram-size 4 --truth data/eval/truth.toml ``` ### What does this tool not do? imdb-rename is a tool for renaming media files, and to the extent that searching IMDb facilitates renaming files, it is also a search tool. There is no intent to develop this further to explore all IMDb data, such as cast/crew information. Folks interested in building a different type of IMDb tool may be interested in the [`imdb-index`](https://docs.rs/imdb-index) crate, which provides programmatic access to the index created by imdb-rename. ### IMDb licensing The data used by imdb-rename is retrieved from [IMDb datasets](https://www.imdb.com/interfaces/). In particular, imdb-rename will never scrape imdb.com, and only uses the data provided by IMDb in the `tsv` files. Additionally, imdb-rename must only be used for non-commercial and personal uses. ================================================ FILE: UNLICENSE ================================================ This is free and unencumbered software released into the public domain. Anyone is free to copy, modify, publish, use, compile, sell, or distribute this software, either in source code form or as a compiled binary, for any purpose, commercial or non-commercial, and by any means. 
In jurisdictions that recognize copyright laws, the author or authors of this software dedicate any and all copyright interest in the software to the public domain. We make this dedication for the benefit of the public at large and to the detriment of our heirs and successors. We intend this dedication to be an overt act of relinquishment in perpetuity of all present and future rights to this software under copyright law. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. For more information, please refer to ================================================ FILE: data/eval/truth.toml ================================================ [[task]] query = "the matrix" answer = "tt0133093" [[task]] query = "homey the clown" answer = "tt0701128" [[task]] query = "homer loves" answer = "tt0773646" [[task]] query = "the matrix: revolutions" answer = "tt0242653" [[task]] query = "troy" answer = "tt0332452" [[task]] query = "o" answer = "tt0184791" [[task]] query = "love and basketball" answer = "tt0199725" [[task]] query = "the last one" answer = "tt0583434" [[task]] query = "pre-destination" answer = "tt2397535" [[task]] query = "1 magic christmas" answer = "tt0089731" [[task]] query = "xmen the last stand" answer = "tt0376994" [[task]] query = "todliche aura" answer = "tt0583427" [[task]] query = "her" answer = "tt1798709" [[task]] query = "its a wonderful life" answer = "tt0038650" [[task]] query = "jason born" answer = "tt4196776" [[task]] query = "cpt america first avenger" answer = "tt0458339" [[task]] query = "batman vs superman dawn justice" answer = "tt2975590" [[task]] query = 
"nightmare before christmas" answer = "tt0107688" [[task]] query = "the man from earth" answer = "tt0756683" [[task]] query = "amazing spiderman 2" answer = "tt1872181" [[task]] query = "the revanant" answer = "tt1663202" [[task]] query = "imaginarium of dr" answer = "tt1054606" [[task]] query = "the dark night" answer = "tt0468569" [[task]] query = "the simpsons" answer = "tt0462538" [[task]] query = "into the bad lands" answer = "tt3865236" [[task]] query = "south park bigger" answer = "tt0158983" [[task]] query = "game of shadows sherlock" answer = "tt1515091" [[task]] query = "ragnarok" answer = "tt3501632" [[task]] query = "riddick" answer = "tt0296572" [[task]] query = "voyage dawn treader" answer = "tt0980970" [[task]] query = "phenomonon" answer = "tt0117333" [[task]] query = "ratchet and clank" answer = "tt2865120" [[task]] query = "spiderman homecoming" answer = "tt2250912" [[task]] query = "sixth sense" answer = "tt0167404" [[task]] query = "there will be blood" answer = "tt0469494" [[task]] query = "gangs new york" answer = "tt0217505" [[task]] query = "first avenger" answer = "tt0458339" [[task]] query = "good shepherd" answer = "tt0343737" [[task]] query = "gone with the wind" answer = "tt0031381" [[task]] query = "bourne identity" answer = "tt0258463" [[task]] query = "seinfeld" answer = "tt0098904" [[task]] query = "lincoln" answer = "tt0443272" [[task]] query = "sherlock" answer = "tt1475582" [[task]] query = "skinner's badass song" answer = "tt0777150" [[task]] query = "flying hellish" answer = "tt0778451" [[task]] query = "springfield files" answer = "tt0701263" [[task]] query = "shot mr burns" answer = "tt0701295" [[task]] query = "camp krusty" answer = "tt0701142" [[task]] query = "the monorail" answer = "tt0701173" [[task]] query = "king homer" answer = "tt0701144" [[task]] query = "mr. 
plow" answer = "tt0701184" ================================================ FILE: data/test/small/title.akas.tsv ================================================ titleId ordering title region language types attributes isOriginalTitle tt0096697 10 Simpsonovi SI \N imdbDisplay \N 0 tt0096697 11 Simpsonovi RS \N imdbDisplay \N 0 tt0096697 12 The Simpsons US \N \N \N 0 tt0096697 13 Gia Dinh Simpsons VN \N imdbDisplay \N 0 tt0096697 14 Simpsonovci SK \N \N \N 0 tt0096697 15 Os Simpsons BR \N \N \N 0 tt0096697 16 Simpsons SE \N imdbDisplay \N 0 tt0096697 17 Simpsoni HR \N \N \N 0 tt0096697 18 Simpsoni LV \N imdbDisplay \N 0 tt0096697 19 Die Simpsons XWG \N \N \N 0 tt0096697 1 Los Simpson MX \N \N \N 0 tt0096697 20 Simpsonovi CSHH \N imdbDisplay \N 0 tt0096697 21 Семейство Симпсън BG bg \N \N 0 tt0096697 22 Els Simpson ES ca imdbDisplay \N 0 tt0096697 23 The Simpsons GR \N \N \N 0 tt0096697 24 Сiмпсони UA \N \N \N 0 tt0096697 25 Simpsonid EE \N \N \N 0 tt0096697 26 Los Simpson ES \N imdbDisplay \N 0 tt0096697 27 Simpsonowie PL \N imdbDisplay \N 0 tt0096697 28 Os Simpsons PT \N \N \N 0 tt0096697 29 I Simpson IT \N \N \N 0 tt0096697 2 The Simpsons \N \N original \N 1 tt0096697 30 Les Simpson CA fr \N dubbed version 0 tt0096697 31 Simpsons NO \N \N \N 0 tt0096697 32 A Simpson család HU \N \N \N 0 tt0096697 33 Al shamshoon EG ar \N dubbed version 0 tt0096697 34 Die Simpsons DE \N imdbDisplay \N 0 tt0096697 35 Familia Simpson RO \N \N \N 0 tt0096697 36 Los Simpson PE \N imdbDisplay \N 0 tt0096697 37 Simpsonai LT \N imdbDisplay \N 0 tt0096697 38 Les Simpson FR \N \N \N 0 tt0096697 3 Los Simpson AR \N \N \N 0 tt0096697 4 Симпсоны RU \N \N \N 0 tt0096697 5 Los Simpson VE \N \N \N 0 tt0096697 6 Simpson Ailesi TR tr imdbDisplay \N 0 tt0096697 7 Simpsons DK \N \N \N 0 tt0096697 8 Simpsonit FI \N \N \N 0 tt0096697 9 Simpsonovi CZ \N imdbDisplay \N 0 ================================================ FILE: data/test/small/title.basics.tsv ================================================ 
tconst titleType primaryTitle originalTitle isAdult startYear endYear runtimeMinutes genres tt0348034 tvEpisode Simpsons Roasting on an Open Fire Simpsons Roasting on an Open Fire 0 1989 \N 30 Animation,Comedy tt0701059 tvEpisode Bart the General Bart the General 0 1990 \N 30 Animation,Comedy tt0701060 tvEpisode Bart the Murderer Bart the Murderer 0 1991 \N 30 Animation,Comedy tt0701062 tvEpisode Bart vs. Thanksgiving Bart vs. Thanksgiving 0 1990 \N 23 Animation,Comedy tt0701063 tvEpisode Bart's Dog Gets an F Bart's Dog Gets an F 0 1991 \N 23 Animation,Comedy tt0701064 tvEpisode Bart's Friend Falls in Love Bart's Friend Falls in Love 0 1992 \N 30 Animation,Comedy tt0701070 tvEpisode Black Widower Black Widower 0 1992 \N 30 Animation,Comedy tt0701076 tvEpisode Brother, Can You Spare Two Dimes? Brother, Can You Spare Two Dimes? 0 1992 \N 30 Animation,Comedy tt0701077 tvEpisode Brush with Greatness Brush with Greatness 0 1991 \N 30 Animation,Comedy tt0701082 tvEpisode Colonel Homer Colonel Homer 0 1992 \N 30 Animation,Comedy tt0701084 tvEpisode Dancin' Homer Dancin' Homer 0 1990 \N 30 Animation,Comedy tt0701098 tvEpisode Flaming Moe's Flaming Moe's 0 1991 \N 30 Animation,Comedy tt0701110 tvEpisode Homer Defined Homer Defined 0 1991 \N 30 Animation,Comedy tt0701114 tvEpisode Homer at the Bat Homer at the Bat 0 1992 \N 30 Animation,Comedy tt0701123 tvEpisode Homer's Night Out Homer's Night Out 0 1990 \N 30 Animation,Comedy tt0701124 tvEpisode Homer's Odyssey Homer's Odyssey 0 1990 \N 30 Animation,Comedy tt0701140 tvEpisode Itchy and Scratchy and Marge Itchy and Scratchy and Marge 0 1990 \N 23 Animation,Comedy tt0701147 tvEpisode Krusty Gets Busted Krusty Gets Busted 0 1990 \N 30 Animation,Comedy tt0701152 tvEpisode Life on the Fast Lane Life on the Fast Lane 0 1990 \N 30 Animation,Comedy tt0701153 tvEpisode Like Father, Like Clown Like Father, Like Clown 0 1991 \N 30 Animation,Comedy tt0701161 tvEpisode Lisa's Pony Lisa's Pony 0 1991 \N 30 Animation,Comedy tt0701164 
tvEpisode Lisa's Substitute Lisa's Substitute 0 1991 \N 30 Animation,Comedy tt0701178 tvEpisode Moaning Lisa Moaning Lisa 0 1990 \N 30 Animation,Comedy tt0701183 tvEpisode Mr. Lisa Goes to Washington Mr. Lisa Goes to Washington 0 1991 \N 30 Animation,Comedy tt0701191 tvEpisode Oh Brother, Where Art Thou? Oh Brother, Where Art Thou? 0 1991 \N 23 Animation,Comedy tt0701192 tvEpisode Old Money Old Money 0 1991 \N 23 Animation,Comedy tt0701195 tvEpisode One Fish, Two Fish, Blowfish, Blue Fish One Fish, Two Fish, Blowfish, Blue Fish 0 1991 \N 23 Animation,Comedy tt0701200 tvEpisode Radio Bart Radio Bart 0 1992 \N 30 Animation,Comedy tt0701204 tvEpisode Separate Vocations Separate Vocations 0 1992 \N 30 Animation,Comedy tt0701211 tvEpisode Simpson and Delilah Simpson and Delilah 0 1990 \N 23 Animation,Comedy tt0701215 tvEpisode Some Enchanted Evening Some Enchanted Evening 0 1990 \N 30 Animation,Comedy tt0701217 tvEpisode Stark Raving Dad Stark Raving Dad 0 1991 \N 30 Animation,Comedy tt0701228 tvEpisode The Call of the Simpsons The Call of the Simpsons 0 1990 \N 30 Animation,Comedy tt0701232 tvEpisode The Crepes of Wrath The Crepes of Wrath 0 1990 \N 30 Animation,Comedy tt0701254 tvEpisode The Otto Show The Otto Show 0 1992 \N 30 Animation,Comedy tt0701269 tvEpisode The Way We Was The Way We Was 0 1991 \N 23 Animation,Comedy tt0701275 tvEpisode Three Men and a Comic Book Three Men and a Comic Book 0 1991 \N 30 Animation,Comedy tt0701278 tvEpisode Treehouse of Horror Treehouse of Horror 0 1990 \N 30 Animation,Comedy tt0756398 tvEpisode The Telltale Head The Telltale Head 0 1990 \N 30 Animation,Comedy tt0756399 tvEpisode There's No Disgrace Like Home There's No Disgrace Like Home 0 1990 \N 30 Animation,Comedy tt0756593 tvEpisode Bart the Genius Bart the Genius 0 1990 \N 30 Animation,Comedy tt0757017 tvEpisode Bart Gets Hit by a Car Bart Gets Hit by a Car 0 1991 \N 23 Animation,Comedy tt0757023 tvEpisode Two Cars in Every Garage and Three Eyes on Every Fish Two Cars in 
Every Garage and Three Eyes on Every Fish 0 1990 \N 23 Animation,Comedy tt0759267 tvEpisode Treehouse of Horror II Treehouse of Horror II 0 1991 \N 30 Animation,Comedy tt0763024 tvEpisode Bart Gets an F Bart Gets an F 0 1990 \N 30 Animation,Comedy tt0763042 tvEpisode When Flanders Failed When Flanders Failed 0 1991 \N 30 Animation,Comedy tt0766140 tvEpisode The War of the Simpsons The War of the Simpsons 0 1991 \N 30 Animation,Comedy tt0767438 tvEpisode Bart the Daredevil Bart the Daredevil 0 1990 \N 23 Animation,Comedy tt0767440 tvEpisode Blood Feud Blood Feud 0 1991 \N 30 Animation,Comedy tt0767442 tvEpisode Dead Putting Society Dead Putting Society 0 1990 \N 30 Animation,Comedy tt0767443 tvEpisode Homer vs. Lisa and the 8th Commandment Homer vs. Lisa and the 8th Commandment 0 1991 \N 23 Animation,Comedy tt0767445 tvEpisode Principal Charming Principal Charming 0 1991 \N 23 Animation,Comedy tt0768553 tvEpisode Bart the Lover Bart the Lover 0 1992 \N 30 Animation,Comedy tt0768554 tvEpisode Dog of Death Dog of Death 0 1992 \N 30 Animation,Comedy tt0768555 tvEpisode Homer Alone Homer Alone 0 1992 \N 30 Animation,Comedy tt0768556 tvEpisode I Married Marge I Married Marge 0 1991 \N 30 Animation,Comedy tt0768557 tvEpisode Lisa the Greek Lisa the Greek 0 1992 \N 30 Animation,Comedy tt0768558 tvEpisode Saturdays of Thunder Saturdays of Thunder 0 1991 \N 30 Animation,Comedy tt0769743 tvEpisode Burns Verkaufen der Kraftwerk Burns Verkaufen der Kraftwerk 0 1991 \N 30 Animation,Comedy ================================================ FILE: data/test/small/title.episode.tsv ================================================ tconst parentTconst seasonNumber episodeNumber tt0348034 tt0096697 1 1 tt0701059 tt0096697 1 5 tt0701060 tt0096697 3 4 tt0701062 tt0096697 2 7 tt0701063 tt0096697 2 16 tt0701064 tt0096697 3 23 tt0701070 tt0096697 3 21 tt0701076 tt0096697 3 24 tt0701077 tt0096697 2 18 tt0701082 tt0096697 3 20 tt0701084 tt0096697 2 5 tt0701098 tt0096697 3 10 tt0701110 tt0096697 
3 5 tt0701114 tt0096697 3 17 tt0701123 tt0096697 1 10 tt0701124 tt0096697 1 3 tt0701140 tt0096697 2 9 tt0701147 tt0096697 1 12 tt0701152 tt0096697 1 9 tt0701153 tt0096697 3 6 tt0701161 tt0096697 3 8 tt0701164 tt0096697 2 19 tt0701178 tt0096697 1 6 tt0701183 tt0096697 3 2 tt0701191 tt0096697 2 15 tt0701192 tt0096697 2 17 tt0701195 tt0096697 2 11 tt0701200 tt0096697 3 13 tt0701204 tt0096697 3 18 tt0701211 tt0096697 2 2 tt0701215 tt0096697 1 13 tt0701217 tt0096697 3 1 tt0701228 tt0096697 1 7 tt0701232 tt0096697 1 11 tt0701254 tt0096697 3 22 tt0701269 tt0096697 2 12 tt0701275 tt0096697 2 21 tt0701278 tt0096697 2 3 tt0756398 tt0096697 1 8 tt0756399 tt0096697 1 4 tt0756593 tt0096697 1 2 tt0757017 tt0096697 2 10 tt0757023 tt0096697 2 4 tt0759267 tt0096697 3 7 tt0763024 tt0096697 2 1 tt0763042 tt0096697 3 3 tt0766140 tt0096697 2 20 tt0767438 tt0096697 2 8 tt0767440 tt0096697 2 22 tt0767442 tt0096697 2 6 tt0767443 tt0096697 2 13 tt0767445 tt0096697 2 14 tt0768553 tt0096697 3 16 tt0768554 tt0096697 3 19 tt0768555 tt0096697 3 15 tt0768556 tt0096697 3 12 tt0768557 tt0096697 3 14 tt0768558 tt0096697 3 9 tt0769743 tt0096697 3 11 ================================================ FILE: data/test/small/title.ratings.tsv ================================================ tconst averageRating numVotes tt0000001 5.8 1356 tt0000002 6.5 157 tt0000003 6.6 939 tt0000004 6.4 93 tt0000005 6.2 1630 tt0000006 5.6 79 tt0000007 5.5 546 tt0000008 5.6 1454 tt0000009 5.4 62 tt0000010 6.9 4880 tt0000011 5.4 193 tt0000012 7.4 8102 tt0000013 5.7 1239 tt0000014 7.2 3542 tt0000015 6.2 606 tt0000016 5.9 922 tt0000017 4.8 181 tt0000018 5.5 389 tt0000019 6.7 12 tt0000020 5.1 219 tt0000022 5.1 703 tt0000023 5.7 875 tt0000024 5.8 18 tt0000025 5.0 14 tt0000026 5.7 1086 ================================================ FILE: imdb-eval/COPYING ================================================ This project is dual-licensed under the Unlicense and MIT licenses. You may use this code under the terms of either license. 
================================================ FILE: imdb-eval/Cargo.toml ================================================ [package] name = "imdb-eval" version = "0.1.2" authors = ["Andrew Gallant "] description = """ A command line utility for evaluating the IMDb name index. """ documentation = "https://github.com/BurntSushi/imdb-rename" homepage = "https://github.com/BurntSushi/imdb-rename" repository = "https://github.com/BurntSushi/imdb-rename" readme = "README.md" keywords = ["imdb", "index", "search", "name", "evaluation"] license = "Unlicense/MIT" edition = "2021" [dependencies] anyhow = "1.0.75" clap = { version = "2.34.0", default-features = false } csv = "1.3.0" imdb-index = { version = "0.1.4", path = "../imdb-index" } lazy_static = "1.4.0" log = { version = "0.4.20", features = ["std"] } serde = { version = "1.0.193", features = ["derive"] } toml = "0.8.8" ================================================ FILE: imdb-eval/LICENSE-MIT ================================================ The MIT License (MIT) Copyright (c) 2015 Andrew Gallant Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: imdb-eval/README.md ================================================ imdb-eval ========= A command line tool for evaluating imdb-rename's search functionality. [![Linux build status](https://api.travis-ci.org/BurntSushi/imdb-rename.png)](https://travis-ci.org/BurntSushi/imdb-rename) [![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/imdb-rename?svg=true)](https://ci.appveyor.com/project/BurntSushi/imdb-rename) [![](http://meritbadge.herokuapp.com/imdb-rename)](https://crates.io/crates/imdb-rename) ### Installation No release binaries are provided for imdb-eval. Instead, users should compile it from source: ``` $ git clone https://github.com/BurntSushi/imdb-rename $ cd imdb-rename $ cargo build --release --all $ ./target/release/imdb-eval --help ``` For more details on how to use imdb-eval, please see imdb-rename's README. ================================================ FILE: imdb-eval/UNLICENSE ================================================ This is free and unencumbered software released into the public domain. Anyone is free to copy, modify, publish, use, compile, sell, or distribute this software, either in source code form or as a compiled binary, for any purpose, commercial or non-commercial, and by any means. In jurisdictions that recognize copyright laws, the author or authors of this software dedicate any and all copyright interest in the software to the public domain. We make this dedication for the benefit of the public at large and to the detriment of our heirs and successors. 
We intend this dedication to be an overt act of relinquishment in perpetuity of all present and future rights to this software under copyright law. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. For more information, please refer to ================================================ FILE: imdb-eval/src/eval.rs ================================================ use std::collections::BTreeMap; use std::fmt; use std::fs::File; use std::io::Read; use std::path::{Path, PathBuf}; use std::time::{Duration, Instant}; use std::vec; use imdb_index::{ Index, IndexBuilder, MediaEntity, NameScorer, NgramType, Query, Searcher, Similarity, }; use lazy_static::lazy_static; use serde::{Deserialize, Serialize}; /// The default truth data used in an evaluation. It's small enough that we /// embed it directly into the binary. const TRUTH_DATA: &str = include_str!("../../data/eval/truth.toml"); lazy_static! { /// A structured representation of the default truth data. static ref TRUTH: Truth = toml::from_str(TRUTH_DATA).unwrap(); } /// The truth data for our evaluation. /// /// The truth data consists of a set of information needs that we call "tasks." #[derive(Clone, Debug, Deserialize)] struct Truth { #[serde(rename = "task")] tasks: Vec, } /// A task or "information need" defined by the truth data. Each task /// corresponds to a query that we feed to the name index, and each task has a /// single correct answer. #[derive(Clone, Debug, Deserialize)] struct Task { query: String, answer: String, } impl Truth { /// Load truth data from the given TOML file. 
fn from_path>(path: P) -> anyhow::Result { let path = path.as_ref(); let mut contents = String::new(); File::open(path)?.read_to_string(&mut contents)?; Ok(toml::from_str(&contents)?) } } /// A specification for running an evaluation. Fundamentally, a specification /// describes the thing we want to evaluate, where the thing we want to /// evaluate is a specific configuration of how we build *and* search an IMDb /// index. /// /// A specification describes both how the index should be built and how /// queries should be generated. Specifications with equivalent index settings /// may reuse the same on-disk index. For example, the ngram size and type are /// index settings, but the similarity function, name scorer and result size /// are all query time settings. /// /// A specification cannot itself produce a complete query. Namely, a /// specification requires an information need (called a "task") to construct /// a query specific to that need. The results of that query are then compared /// with that information need's answer to determine the score, which is, /// invariably, a reflection of how well the configuration given by this /// specification performs. #[derive(Clone, Debug, Eq, PartialEq)] pub struct Spec { result_size: usize, ngram_size: usize, ngram_type: NgramType, sim: Similarity, scorer: Option, } impl Spec { /// Create a new spec using a default configuration. pub fn new() -> Spec { Spec { result_size: 30, ngram_size: 3, ngram_type: NgramType::default(), sim: Similarity::None, scorer: Some(NameScorer::OkapiBM25), } } /// Set the result size for this specification. /// /// This returns an error if the given size is less than `1`. pub fn with_result_size( mut self, result_size: usize, ) -> anyhow::Result { if result_size < 1 { anyhow::bail!( "result size {} is invalid, must be greater than 0", result_size ); } self.result_size = result_size; Ok(self) } /// Set the ngram size for this specification. 
/// /// This returns an error if the given size is less than `2`. pub fn with_ngram_size( mut self, ngram_size: usize, ) -> anyhow::Result { if ngram_size < 2 { anyhow::bail!( "ngram size {} is invalid, must be greater than 1", ngram_size, ); } self.ngram_size = ngram_size; Ok(self) } /// Set the ngram type for this specification. pub fn with_ngram_type(mut self, ngram_type: NgramType) -> Spec { self.ngram_type = ngram_type; self } /// Set the similarity ranker function for this specification. pub fn with_similarity(mut self, sim: Similarity) -> Spec { self.sim = sim; self } /// Set the name scorer for this specification. /// /// Note that if the given scorer is `None`, then an evaluation will likely /// be quite slow, since each information need will result in an exhaustive /// search of the corpus. pub fn with_scorer(mut self, scorer: Option) -> Spec { self.scorer = scorer; self } /// Evaluate this specification against the built-in truth data. pub fn evaluate, P2: AsRef>( &self, data_dir: P1, eval_dir: P2, ) -> anyhow::Result { let searcher = Searcher::new(self.index(data_dir, eval_dir)?); Ok(Evaluation { evaluator: Evaluator { spec: self, searcher }, tasks: TRUTH.clone().tasks.into_iter(), }) } /// Evaluate this specification against a set of truth data at the given /// file path. pub fn evaluate_with, P2: AsRef, P3: AsRef>( &self, data_dir: P1, eval_dir: P2, truth_path: P3, ) -> anyhow::Result { let searcher = Searcher::new(self.index(data_dir, eval_dir)?); Ok(Evaluation { evaluator: Evaluator { spec: self, searcher }, tasks: Truth::from_path(truth_path)?.tasks.into_iter(), }) } /// Create a query derived from this specification and a particular /// information need or "task." fn query(&self, task: &Task) -> Query { Query::new() .name(&task.query) .name_scorer(self.scorer.clone()) .similarity(self.sim.clone()) .size(self.result_size) } /// Either open or create an index suitable for this specification. 
/// /// If no index exists in the expected sub-directory of `eval_dir`, then /// a new index is created. fn index, P2: AsRef>( &self, data_dir: P1, eval_dir: P2, ) -> anyhow::Result { let index_dir = self.index_dir(eval_dir.as_ref()); Ok(if index_dir.exists() { Index::open(data_dir, index_dir)? } else { IndexBuilder::new() .ngram_size(self.ngram_size) .ngram_type(self.ngram_type) .create(data_dir, index_dir)? }) } /// The sub-directory of `eval_dir` in which to store this specification's /// index. fn index_dir>(&self, eval_dir: P) -> PathBuf { eval_dir.as_ref().join(self.index_name()) } /// The expected name of the index for this evaluation specification. /// /// The name of the index is derived specifically from this specification's /// index-time settings, such as the ngram size. This permits multiple /// distinct specifications to reuse the same index. fn index_name(&self) -> String { format!("ngram-{}_ngram-type-{}", self.ngram_size, self.ngram_type) } } impl Default for Spec { fn default() -> Spec { Spec::new() } } impl fmt::Display for Spec { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { let scorer = match self.scorer { None => "none".to_string(), Some(ref scorer) => scorer.to_string(), }; write!( f, "size-{}_ngram-{}_ngram-type-{}_sim-{}_scorer-{}", self.result_size, self.ngram_size, self.ngram_type, self.sim, scorer, ) } } /// A summary of the results of evaluating every information need or "task" for /// a single evaluation specification. The summary boils the quality of the /// specification down to two figures: the mean reciprocal rank and the ratio /// of tasks that produced an answer. /// /// The mean reciprocal rank measures the average precision of the /// specification. That is, it measures how well we answer the following /// question: "If your search produced the correct answer, how highly was it /// ranked?" 
/// /// The ratio of tasks that produced an answer measures how well we answer the /// following question: "Of the searches ran, how many of them produced the /// correct result at any rank?" /// /// Implicit in the evaluation is the notion of a bounded number of results. /// That is, every specification dictates the maximum number of results /// returned by a search. If the answer isn't in that result set, then we stop /// there and declare that the answer wasn't found. /// /// The reason for using two different scores is so that they counter balance /// each other. Namely, a specification that does really well on a smaller /// number of results might end up with a higher MRR than other specifications, /// but will have a lower ratio of successful searches. #[derive(Debug, Deserialize, Serialize)] pub struct Summary { /// The specification name that this result is summarizing. pub name: String, /// Mean reciprocal rank. pub mrr: f64, /// The ratio of tasks that found an answer. The higher the better. pub found: f64, } impl Summary { /// Returns a group of summaries for all distinct specifications found /// in the back of results given. /// /// If no results are given, then no summaries are returned. pub fn from_task_results(results: &[TaskResult]) -> Vec { let mut grouped: BTreeMap<&str, Vec<&TaskResult>> = BTreeMap::new(); for result in results { grouped.entry(&result.name).or_insert(vec![]).push(result); } let mut summaries = vec![]; for results in grouped.values() { summaries.push(Summary::from_same_task_results(results)); } summaries } /// Returns a summary for a single group of task results. All the results /// given must have the same name, otherwise this panics. This also panics /// if the given results are empty. 
fn from_same_task_results(results: &[&TaskResult]) -> Summary { assert!(!results.is_empty()); assert!(results.iter().all(|r| results[0].name == r.name)); let mut precision_sum = 0.0; let mut found = 0u64; for r in results { precision_sum += r.rank.map_or(0.0, |rank| 1.0 / (rank as f64)); if r.rank.is_some() { found += 1; } } Summary { name: results[0].name.clone(), mrr: precision_sum / (results.len() as f64), found: (found as f64) / (results.len() as f64), } } } /// The result of evaluating a single information need or "task." #[derive(Debug, Deserialize, Serialize)] pub struct TaskResult { /// The name of the evaluation's spec. This name includes all of the /// parameters that influence the evaluation, such as ngram size, /// similarity function, etc. pub name: String, /// The freeform text query, which represents a specific manifestation of /// this information need. Generally speaking, this corresponds to the /// query that an end user will type. pub query: String, /// The IMDb identifier corresponding to a singular answer expected by an /// end user. pub answer: String, /// If the answer appears in the search results, then this corresponds to /// the rank of that search result. The rank is determined by the answer's /// absolute position in the list of ranked search results. /// /// Ties in the ranked list are handled by assigning the maximum possible /// rank to each search result with the same score. For example, if we /// request 30 results and the answer is incidentally 10th in the list but /// every search result has the same score of 1.0, then the rank of our /// answer is 30. (Indeed, the rank of every search result is 30 in this /// example.) pub rank: Option, /// The time it took to execute this query, in seconds. pub duration_seconds: f64, } /// An evaluation is an iterator over all of the results of evaluating every /// information need in the truth data. 
#[derive(Debug)] pub struct Evaluation<'s> { /// The evaluator, which turns an information need into a `TaskResult`. evaluator: Evaluator<'s>, /// All of the tasks to evaluate. tasks: vec::IntoIter, } impl<'s> Iterator for Evaluation<'s> { type Item = anyhow::Result; fn next(&mut self) -> Option> { self.tasks.next().map(|task| self.evaluator.run(&task)) } } /// An evaluator is responsible for executing a single search for a single /// information need. It records the evaluation of that search result in a /// `TaskResult`. #[derive(Debug)] struct Evaluator<'s> { /// The evaluation specification. spec: &'s Spec, /// A handle to a searcher for an IMDb index. searcher: Searcher, } impl<'s> Evaluator<'s> { /// Run this evaluator on a single information need and return the /// evaluation. fn run(&mut self, task: &Task) -> anyhow::Result { let start = Instant::now(); let rank = self.rank(task)?; let duration = Instant::now().duration_since(start); Ok(TaskResult { name: self.spec.to_string(), query: task.query.clone(), answer: task.answer.clone(), rank, duration_seconds: fractional_seconds(&duration), }) } /// Execute the search for the given information need and determine the /// rank of the expected answer for the given information need. If the /// expected answer didn't appear in the search results, then `None` is /// returned. /// /// The rank of the answer is determined in exactly the way you might /// expect: if the answer appears as the Nth result in a search, then its /// rank is N. There is one tricky part of this, and it is specifically in /// how we break ties. Stated succinctly, we always take the maximum /// possible rank of a result. 
For example, given the following results, /// where the first column is the score, the second column is the /// result name, and the third column is the *intuitive* rank: /// /// 1.0 a 1 /// 1.0 b 1 /// 1.0 c 1 /// 0.9 d 4 /// 0.8 e 5 /// 0.8 f 5 /// 0.7 g 7 /// /// Namely, records that are tied all get assigned the same rank, and the /// next result with a lower score is assigned a rank equivalent to its /// absolute position in the result list. /// /// The problem with this ranking strategy is that it biases toward rankers /// that have a naive score. In particular, so long as a search returns the /// answer in the results, it could assign a score of `1.0` to every /// result and get a maximal RR (Reciprocal Rank) evaluation. /// /// Instead, we invert how results are ranked. The above example is instead /// ranked like so: /// /// 1.0 a 3 /// 1.0 b 3 /// 1.0 c 3 /// 0.9 d 4 /// 0.8 e 6 /// 0.8 f 6 /// 0.7 g 7 /// /// In other words, we assign the maximal possible rank instead of the /// minimal possible rank. /// /// There are other strategies, but in general, we want to reward high /// precision rankers. fn rank(&mut self, task: &Task) -> anyhow::Result> { let results = self.searcher.search(&self.spec.query(&task))?; let mut rank = results.len() as u64; let mut prev_score = None; let mut ranked: Vec<(u64, MediaEntity)> = vec![]; for (i, scored) in results.into_iter().enumerate().rev() { let (score, entity) = scored.into_pair(); if prev_score.map_or(true, |s| !approx_eq(s, score)) { rank = i as u64 + 1; prev_score = Some(score); } ranked.push((rank, entity)); } ranked.reverse(); for (rank, entity) in ranked { if entity.title().id == task.answer { return Ok(Some(rank)); } } Ok(None) } } /// Compares two floating point numbers for equality approximately for some /// epsilon. fn approx_eq(x1: f64, x2: f64) -> bool { // We used a fixed error because it's good enough in practice. 
(x1 - x2).abs() <= 0.0000000001 } /// Returns the number of seconds in this duration in fraction form. /// The number to the left of the decimal point is the number of seconds, /// and the number to the right is the number of milliseconds. fn fractional_seconds(d: &Duration) -> f64 { let fractional = (d.subsec_nanos() as f64) / 1_000_000_000.0; d.as_secs() as f64 + fractional } #[cfg(test)] mod tests { use imdb_index::{NameScorer, NgramType, Similarity}; use super::Spec; #[test] fn spec_printer() { let spec = Spec { result_size: 30, ngram_size: 3, ngram_type: NgramType::Window, sim: Similarity::None, scorer: Some(NameScorer::OkapiBM25), }; let expected = "size-30_ngram-3_ngram-type-window_sim-none_scorer-okapibm25"; assert_eq!(spec.to_string(), expected); let spec = Spec { result_size: 1, ngram_size: 2, ngram_type: NgramType::Edge, sim: Similarity::Jaro, scorer: None, }; let expected = "size-1_ngram-2_ngram-type-edge_sim-jaro_scorer-none"; assert_eq!(spec.to_string(), expected); } } ================================================ FILE: imdb-eval/src/logger.rs ================================================ // This module defines a super simple logger that works with the `log` crate. // We don't need anything fancy; just basic log levels and the ability to // print to stderr. We therefore avoid bringing in extra dependencies just // for this functionality. use log::Log; use anyhow::Result; /// Initialize a simple logger. pub fn init() -> Result<()> { Ok(Logger::init()?) } /// The simplest possible logger that logs to stderr. /// /// This logger does no filtering. Instead, it relies on the `log` crates /// filtering via its global max_level setting. #[derive(Debug)] struct Logger(()); const LOGGER: &'static Logger = &Logger(()); impl Logger { /// Create a new logger that logs to stderr and initialize it as the /// global logger. If there was a problem setting the logger, then an /// error is returned. 
fn init() -> std::result::Result<(), log::SetLoggerError> { log::set_logger(LOGGER) } } impl Log for Logger { fn enabled(&self, _: &log::Metadata) -> bool { // We set the log level via log::set_max_level, so we don't need to // implement filtering here. true } fn log(&self, record: &log::Record) { if !should_log(record) { return; } eprintln!("{}: {}", record.level(), record.args()); } fn flush(&self) { // We use eprintln! which is flushed on every call. } } fn should_log(record: &log::Record) -> bool { let t = record.target(); t.starts_with("imdb_rename") || t.starts_with("imdb_index") } ================================================ FILE: imdb-eval/src/main.rs ================================================ use std::env; use std::io; use std::path::{Path, PathBuf}; use std::process; use std::result; use std::str::FromStr; use imdb_index::{NameScorer, NgramType, Similarity}; use lazy_static::lazy_static; use crate::eval::Spec; mod eval; mod logger; fn main() { if let Err(err) = try_main() { // A pipe error occurs when the consumer of this process's output has // hung up. This is a normal event, and we should quit gracefully. if is_pipe_error(&err) { process::exit(0); } eprintln!("{:?}", err); process::exit(1); } } fn try_main() -> anyhow::Result<()> { logger::init()?; log::set_max_level(log::LevelFilter::Info); let args = Args::from_matches(&app().get_matches())?; if args.debug { log::set_max_level(log::LevelFilter::Debug); } if let Some(ref summarize) = args.summarize { return run_summarize(summarize); } else if args.dry_run { for spec in args.specs()? { println!("{}", spec); } return Ok(()); } run_eval( &args.data_dir, &args.eval_dir, args.truth.as_ref().map(|p| p.as_path()), args.specs()?, ) } /// Run an evaluation on the IMDb data in `data_dir`, and store any indexes /// created for the evaluation in `eval_dir`. 
If a path to truth data is given, /// then the information needs or "tasks" used for the evaluation are taken /// from that file, otherwise, a built-in truth data set is used. /// /// The specs given each describe the protocol for an evaluation. They each /// represent a configuration for how an IMDb index is built and how queries /// are constructed. The specification is fundamentally the thing we want to /// evaluate. That is, we want to find the "best" specification. fn run_eval( data_dir: &Path, eval_dir: &Path, truth_path: Option<&Path>, specs: Vec, ) -> anyhow::Result<()> { if !data_dir.exists() { anyhow::bail!( "data directory {} does not exist; please use \ imdb-rename to create it", data_dir.display() ); } let mut wtr = csv::Writer::from_writer(io::stdout()); for spec in &specs { let results = match truth_path { None => spec.evaluate(data_dir, eval_dir)?, Some(p) => spec.evaluate_with(data_dir, eval_dir, p)?, }; for result in results { wtr.serialize(result?)?; wtr.flush()?; } } Ok(()) } /// Summarize the evaluation results at the given path. fn run_summarize(summarize: &Path) -> anyhow::Result<()> { let mut results: Vec = vec![]; let mut rdr = csv::Reader::from_path(summarize)?; for result in rdr.deserialize() { results.push(result?); } let mut wtr = csv::Writer::from_writer(io::stdout()); for summary in eval::Summary::from_task_results(&results) { wtr.serialize(summary)?; } wtr.flush()?; Ok(()) } #[derive(Debug)] struct Args { data_dir: PathBuf, debug: bool, dry_run: bool, eval_dir: PathBuf, ngram_sizes: Vec, ngram_types: Vec, result_sizes: Vec, scorers: Vec>, similarities: Vec, summarize: Option, truth: Option, } impl Args { /// Build a structured set of arguments from clap's matches. 
fn from_matches(matches: &clap::ArgMatches) -> anyhow::Result { let data_dir = matches.value_of_os("data-dir").map(PathBuf::from).unwrap(); let eval_dir = matches.value_of_os("eval-dir").map(PathBuf::from).unwrap(); let similarities = parse_many_lossy( matches, "sim", vec![ Similarity::None, Similarity::Levenshtein, Similarity::Jaro, Similarity::JaroWinkler, ], )?; let scorers = parse_many_lossy( matches, "scorer", vec![ OptionalNameScorer::from(NameScorer::OkapiBM25), OptionalNameScorer::from(NameScorer::TFIDF), OptionalNameScorer::from(NameScorer::Jaccard), OptionalNameScorer::from(NameScorer::QueryRatio), ], )? .into_iter() .map(|s| s.0) .collect(); let ngram_types = parse_many_lossy(matches, "ngram-type", vec![NgramType::Window])?; Ok(Args { data_dir, debug: matches.is_present("debug"), dry_run: matches.is_present("dry-run"), eval_dir, ngram_sizes: parse_many_lossy(matches, "ngram-size", vec![3])?, ngram_types, result_sizes: parse_many_lossy(matches, "result-size", vec![30])?, scorers, similarities, summarize: matches.value_of_os("summarize").map(PathBuf::from), truth: matches.value_of_os("truth").map(PathBuf::from), }) } /// Build all evaluation specifications as indicated by command line /// options. fn specs(&self) -> anyhow::Result> { // We want to build all possible permutations. We do this by // alternating between specs1 and specs2. Each additional parameter // combinatorially explodes the previous set of specifications. let (mut specs1, mut specs2) = (vec![], vec![]); for &ngram_size in &self.ngram_sizes { specs1.push(Spec::new().with_ngram_size(ngram_size)?); } for spec in specs1.drain(..) { for &result_size in &self.result_sizes { specs2.push(spec.clone().with_result_size(result_size)?); } } for spec in specs2.drain(..) { for sim in &self.similarities { specs1.push(spec.clone().with_similarity(sim.clone())); } } for spec in specs1.drain(..) 
{ for scorer in &self.scorers { specs2.push(spec.clone().with_scorer(scorer.clone())); } } for spec in specs2.drain(..) { for ngram_type in &self.ngram_types { specs1.push(spec.clone().with_ngram_type(ngram_type.clone())); } } Ok(specs1) } } fn app() -> clap::App<'static, 'static> { use clap::{App, AppSettings, Arg}; lazy_static! { // clap wants all of its strings tied to a particular lifetime, but // we'd really like to determine some default values dynamically. Using // a lazy_static here is one way of safely giving a static lifetime to // a value that is computed at runtime. // // An alternative approach would be to compute all of our default // values in the caller, and pass them into this function. It's nicer // to defined what we need here though. Locality of reference and all // that. static ref DEFAULT_DATA_DIR: PathBuf = env::temp_dir().join("imdb-rename"); static ref DEFAULT_EVAL_DIR: PathBuf = env::temp_dir().join("imdb-rename-eval"); static ref POSSIBLE_SCORER_NAMES: Vec<&'static str> = { let mut names = NameScorer::possible_names().to_vec(); names.insert(0, "none"); names }; } App::new("imdb-rename") .author(clap::crate_authors!()) .version(clap::crate_version!()) .max_term_width(100) .setting(AppSettings::UnifiedHelpMessage) .arg(Arg::with_name("data-dir") .long("data-dir") .env("IMDB_RENAME_DATA_DIR") .takes_value(true) .default_value_os(DEFAULT_DATA_DIR.as_os_str()) .help("The location to store IMDb data files.")) .arg(Arg::with_name("debug") .long("debug") .help("Show debug messages. 
Use this when filing bugs.")) .arg(Arg::with_name("dry-run") .long("dry-run") .help("Show the evaluations that would be run and then exit \ without running them.")) .arg(Arg::with_name("eval-dir") .long("eval-dir") .env("IMDB_RENAME_EVAL_DIR") .takes_value(true) .default_value_os(DEFAULT_EVAL_DIR.as_os_str()) .help("The location to store evaluation index files.")) .arg(Arg::with_name("ngram-size") .long("ngram-size") .takes_value(true) .multiple(true) .number_of_values(1) .help("Set the ngram size on which to perform an evaluation. \ An evaluation will be performed for each ngram size. \ If no ngram size is given, a default of 3 is used.")) .arg(Arg::with_name("ngram-type") .long("ngram-type") .takes_value(true) .multiple(true) .number_of_values(1) .possible_values(NgramType::possible_names()) .help("Set the ngram type on which to perform an evaluation. \ An evaluation will be performed for each ngram type. \ If no ngram type is given, it defaults to 'window'.")) .arg(Arg::with_name("result-size") .long("result-size") .takes_value(true) .multiple(true) .number_of_values(1) .help("Set the result size on which to perform an evaluation. \ An evaluation will be performed for each result size. \ If no result size is given, a default of 30 is used.")) .arg(Arg::with_name("scorer") .long("scorer") .takes_value(true) .multiple(true) .number_of_values(1) .possible_values(&POSSIBLE_SCORER_NAMES) .help("Set the name scorer function to use. An evaluation is \ performed for each name function given. By default, \ all name scorers are used, except for 'none'.")) .arg(Arg::with_name("sim") .long("sim") .takes_value(true) .multiple(true) .number_of_values(1) .possible_values(Similarity::possible_names()) .help("Set the similarity ranker function to use. An evaluation \ is performed for each ranker function given. 
By default, \ all ranker functions are used, including 'none'.")) .arg(Arg::with_name("summarize") .long("summarize") .takes_value(true) .number_of_values(1) .help("Print summary statistics from an evaluation run.")) .arg(Arg::with_name("truth") .long("truth") .takes_value(true) .help("A file path containing evaluation truth data. By default, \ an evaluation uses truth data embedded in imdb-rename.")) } /// An optional name scorer is a `NameScorer` that may be absent. /// /// We define a type for it to make parsing it easier. #[derive(Debug)] struct OptionalNameScorer(Option); impl FromStr for OptionalNameScorer { type Err = imdb_index::Error; fn from_str( s: &str, ) -> result::Result { let opt = if s == "none" { None } else { Some(s.parse()?) }; Ok(OptionalNameScorer(opt)) } } impl From for OptionalNameScorer { fn from(scorer: NameScorer) -> OptionalNameScorer { OptionalNameScorer(Some(scorer)) } } /// Parse a sequence of values from clap. fn parse_many_lossy< E: std::error::Error + Send + Sync + 'static, T: FromStr, >( matches: &clap::ArgMatches, name: &str, default: Vec, ) -> anyhow::Result> { let strs = match matches.values_of_lossy(name) { None => return Ok(default), Some(strs) => strs, }; let mut values = vec![]; for s in strs { values.push(s.parse()?); } Ok(values) } /// Return true if and only if an I/O broken pipe error exists in the causal /// chain of the given error. fn is_pipe_error(err: &anyhow::Error) -> bool { for cause in err.chain() { if let Some(ioerr) = cause.downcast_ref::() { if ioerr.kind() == io::ErrorKind::BrokenPipe { return true; } } } false } ================================================ FILE: imdb-index/COPYING ================================================ This project is dual-licensed under the Unlicense and MIT licenses. You may use this code under the terms of either license. 
================================================ FILE: imdb-index/Cargo.toml ================================================ [package] name = "imdb-index" version = "0.1.4" #:version authors = ["Andrew Gallant "] description = """ A library for indexing and searching IMDb using information retrieval. """ documentation = "https://github.com/BurntSushi/imdb-rename" homepage = "https://github.com/BurntSushi/imdb-rename" repository = "https://github.com/BurntSushi/imdb-rename" readme = "README.md" keywords = ["imdb", "movie", "index", "search"] license = "Unlicense/MIT" edition = "2021" [dependencies] csv = "1.3.0" fnv = "1.0.7" fst = "0.4.7" lazy_static = "1.4.0" log = { version = "0.4.20", features = ["std"] } memmap = { package = "memmap2", version = "0.9.1" } regex = "1.10.2" serde = { version = "1.0.193", features = ["derive"] } serde_json = "1.0.108" strsim = "0.10.0" ================================================ FILE: imdb-index/LICENSE-MIT ================================================ The MIT License (MIT) Copyright (c) 2015 Andrew Gallant Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: imdb-index/README.md ================================================ imdb-index ========== A library for reading and writing an IMDb index, with a focus on IMDb titles. In particular, this library can build a name index on all of IMDb's 6 million names, which supports fast fuzzy searching and relevance ranking. [![Linux build status](https://api.travis-ci.org/BurntSushi/imdb-rename.png)](https://travis-ci.org/BurntSushi/imdb-rename) [![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/imdb-rename?svg=true)](https://ci.appveyor.com/project/BurntSushi/imdb-rename) [![](http://meritbadge.herokuapp.com/imdb-rename)](https://crates.io/crates/imdb-index) Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org). ### Documentation https://docs.rs/imdb-index ================================================ FILE: imdb-index/UNLICENSE ================================================ This is free and unencumbered software released into the public domain. Anyone is free to copy, modify, publish, use, compile, sell, or distribute this software, either in source code form or as a compiled binary, for any purpose, commercial or non-commercial, and by any means. In jurisdictions that recognize copyright laws, the author or authors of this software dedicate any and all copyright interest in the software to the public domain. We make this dedication for the benefit of the public at large and to the detriment of our heirs and successors. We intend this dedication to be an overt act of relinquishment in perpetuity of all present and future rights to this software under copyright law. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. For more information, please refer to ================================================ FILE: imdb-index/src/error.rs ================================================ use std::fmt; use std::path::{Path, PathBuf}; /// A type alias for handling errors throughout imdb-index. pub type Result = std::result::Result; /// An error that can occur while interacting with an IMDb index. #[derive(Debug)] pub struct Error { kind: ErrorKind, } impl Error { /// Return a reference to the kind of this error. pub fn kind(&self) -> &ErrorKind { &self.kind } /// Transfer ownership of the kind of this error. 
pub fn into_kind(self) -> ErrorKind { self.kind } pub(crate) fn new(kind: ErrorKind) -> Error { Error { kind } } pub(crate) fn unknown_title>(unk: T) -> Error { Error { kind: ErrorKind::UnknownTitle(unk.as_ref().to_string()) } } pub(crate) fn unknown_scorer>(unk: T) -> Error { Error { kind: ErrorKind::UnknownScorer(unk.as_ref().to_string()) } } pub(crate) fn unknown_ngram_type>(unk: T) -> Error { Error { kind: ErrorKind::UnknownNgramType(unk.as_ref().to_string()) } } pub(crate) fn unknown_sim>(unk: T) -> Error { Error { kind: ErrorKind::UnknownSimilarity(unk.as_ref().to_string()) } } pub(crate) fn unknown_directive>(unk: T) -> Error { Error { kind: ErrorKind::UnknownDirective(unk.as_ref().to_string()) } } pub(crate) fn bug>(msg: T) -> Error { Error { kind: ErrorKind::Bug(msg.as_ref().to_string()) } } pub(crate) fn config>(msg: T) -> Error { Error { kind: ErrorKind::Config(msg.as_ref().to_string()) } } pub(crate) fn version(expected: u64, got: u64) -> Error { Error { kind: ErrorKind::VersionMismatch { expected, got } } } pub(crate) fn csv(err: csv::Error) -> Error { Error { kind: ErrorKind::Csv(err.to_string()) } } pub(crate) fn fst(err: fst::Error) -> Error { Error { kind: ErrorKind::Fst(err.to_string()) } } pub(crate) fn io(err: std::io::Error) -> Error { Error { kind: ErrorKind::Io { err, path: None } } } pub(crate) fn io_path>( err: std::io::Error, path: P, ) -> Error { Error { kind: ErrorKind::Io { err, path: Some(path.as_ref().to_path_buf()), }, } } pub(crate) fn number( err: E, ) -> Error { Error { kind: ErrorKind::Number(Box::new(err)) } } } impl std::error::Error for Error { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match self.kind { ErrorKind::Io { ref err, .. } => Some(err), ErrorKind::Number(ref err) => Some(&**err), _ => None, } } } impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { self.kind.fmt(f) } } /// The specific kind of error that can occur. 
#[derive(Debug)]
pub enum ErrorKind {
    /// An index version mismatch. This error occurs when the version of the
    /// index is different from the version supported by this version of
    /// imdb-index.
    ///
    /// Generally speaking, the versions must be exactly equivalent, otherwise
    /// this error is returned.
    VersionMismatch {
        /// The expected or supported index version.
        expected: u64,
        /// The actual version of the index on disk.
        got: u64,
    },
    /// An error parsing the type of a title.
    ///
    /// The data provided is the unrecognized title type.
    UnknownTitle(String),
    /// An error parsing the name of a scorer.
    ///
    /// The data provided is the unrecognized name.
    UnknownScorer(String),
    /// An error parsing the name of an ngram type.
    ///
    /// The data provided is the unrecognized name.
    UnknownNgramType(String),
    /// An error parsing the name of a similarity function.
    ///
    /// The data provided is the unrecognized name.
    UnknownSimilarity(String),
    /// An error parsing the name of a directive from a free-form query.
    ///
    /// The data provided is the unrecognized name.
    UnknownDirective(String),
    /// An unexpected error occurred while reading an index that should not
    /// have occurred. Generally, these errors correspond to bugs in this
    /// library.
    Bug(String),
    /// An error occurred while reading/writing the index config.
    Config(String),
    /// An error that occured while writing or reading CSV data.
    Csv(String),
    /// An error that occured while creating an FST index.
    Fst(String),
    /// An unexpected I/O error occurred.
    Io {
        /// The underlying I/O error.
        err: std::io::Error,
        /// A file path, if the I/O error occurred in the context of a named
        /// file.
        // NOTE(review): generic parameter restored — `Error::io_path` stores
        // `Some(path.as_ref().to_path_buf())` here.
        path: Option<PathBuf>,
    },
    /// An error occurred while parsing a number in a free-form query.
    // NOTE(review): boxed trait-object type restored to match
    // `Error::number` and the `source()` impl.
    Number(Box<dyn std::error::Error + Send + Sync + 'static>),
    /// Hints that destructuring should not be exhaustive.
    ///
    /// This enum may grow additional variants, so this makes sure clients
    /// don't count on exhaustive matching. (Otherwise, adding a new variant
    /// could break existing code.)
#[doc(hidden)] __Nonexhaustive, } impl fmt::Display for ErrorKind { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match *self { ErrorKind::VersionMismatch { expected, got } => write!( f, "index version mismatch: expected version {} \ but got version {}. Please rebuild the index.", expected, got ), ErrorKind::UnknownTitle(ref unk) => { write!(f, "unrecognized title type: '{}'", unk) } ErrorKind::UnknownScorer(ref unk) => { write!(f, "unrecognized scorer name: '{}'", unk) } ErrorKind::UnknownNgramType(ref unk) => { write!(f, "unrecognized ngram type: '{}'", unk) } ErrorKind::UnknownSimilarity(ref unk) => { write!(f, "unrecognized similarity function: '{}'", unk) } ErrorKind::UnknownDirective(ref unk) => { write!(f, "unrecognized search directive: '{}'", unk) } ErrorKind::Bug(ref msg) => { let report = "Please report this bug with a backtrace at \ https://github.com/BurntSushi/imdb-rename"; write!(f, "BUG: {}\n{}", msg, report) } ErrorKind::Config(ref msg) => write!(f, "config error: {}", msg), ErrorKind::Csv(ref msg) => write!(f, "{}", msg), ErrorKind::Fst(ref msg) => write!(f, "fst error: {}", msg), ErrorKind::Io { path: None, .. } => write!(f, "I/O error"), ErrorKind::Io { path: Some(ref p), .. } => { write!(f, "{}", p.display()) } ErrorKind::Number(_) => write!(f, "error parsing number"), ErrorKind::__Nonexhaustive => panic!("invalid error"), } } } ================================================ FILE: imdb-index/src/index/aka.rs ================================================ use std::io; use std::iter; use std::path::Path; use memmap::Mmap; use crate::error::{Error, Result}; use crate::index::{csv_file, csv_mmap, id}; use crate::record::AKA; use crate::util::IMDB_AKAS; /// A name of the AKA record index file. /// /// This index represents a map from IMDb title id to a 64-bit integer. 
The /// 64-bit integer encodes two pieces of information: the number of alternate /// names for the title (high 16 bits) and the file offset at which the records /// appear in title.akas.tsv (low 48 bits). const AKAS: &str = "akas.fst"; /// A handle to the AKA name index. /// /// The AKA index maps IMDb identifiers to a list of AKA records. /// /// This index assumes that the underlying AKA CSV file is sorted by IMDb ID. #[derive(Debug)] pub struct Index { akas: csv::Reader>, idx: id::IndexReader, } impl Index { /// Open an AKA index using the corresponding data and index directories. /// The data directory contains the IMDb data set while the index directory /// contains the index data files. pub fn open, P2: AsRef>( data_dir: P1, index_dir: P2, ) -> Result { Ok(Index { // We claim it is safe to open the following memory map because we // don't mutate them and no other process (should) either. akas: unsafe { csv_mmap(data_dir.as_ref().join(IMDB_AKAS))? }, idx: id::IndexReader::from_path(index_dir.as_ref().join(AKAS))?, }) } /// Create an AKA index by reading the AKA data from the given data /// directory and writing the index to the corresponding index directory. pub fn create, P2: AsRef>( data_dir: P1, index_dir: P2, ) -> Result { let data_dir = data_dir.as_ref(); let index_dir = index_dir.as_ref(); let rdr = csv_file(data_dir.join(IMDB_AKAS))?; let mut wtr = id::IndexSortedWriter::from_path(index_dir.join(AKAS))?; let mut count = 0u64; for result in AKAIndexRecords::new(rdr) { let record = result?; wtr.insert(&record.id, (record.count << 48) | record.offset)?; count += record.count; } wtr.finish()?; log::info!("{} alternate names indexed", count); Index::open(data_dir, index_dir) } /// Return a (possibly empty) iterator over all AKA records for the given /// IMDb ID. 
pub fn find(&mut self, id: &[u8]) -> Result { match self.idx.get(id) { None => Ok(AKARecordIter(None)), Some(v) => { let count = (v >> 48) as usize; let offset = v & ((1 << 48) - 1); let mut pos = csv::Position::new(); pos.set_byte(offset); self.akas.seek(pos).map_err(Error::csv)?; Ok(AKARecordIter(Some(self.akas.deserialize().take(count)))) } } } } /// An iterator over AKA records for a single IMDb title. /// /// This iterator is constructed via the `aka::Index::find` method. /// /// This iterator may yield no titles. /// /// The lifetime `'r` refers to the lifetime of the underlying AKA index /// reader. pub struct AKARecordIter<'r>( Option, AKA>>>, ); impl<'r> Iterator for AKARecordIter<'r> { type Item = Result; fn next(&mut self) -> Option> { let next = match self.0.as_mut().and_then(|it| it.next()) { None => return None, Some(next) => next, }; match next { Ok(next) => Some(Ok(next)), Err(err) => Some(Err(Error::csv(err))), } } } /// An indexable AKA record. /// /// Each indexable record represents a group of alternative titles in the /// title.akas.tsv file. #[derive(Clone, Debug, Eq, PartialEq)] struct AKAIndexRecord { id: Vec, offset: u64, count: u64, } /// A streaming iterator over indexable AKA records. /// /// Each indexable record is a triple, and consists of an IMDb title ID, /// the number of alternate titles for that title, and the file offset in the /// CSV file at which those records begin. /// /// The `R` type parameter refers to the underlying `io::Read` type of the /// CSV reader. #[derive(Debug)] struct AKAIndexRecords { /// The underlying CSV reader. rdr: csv::Reader, /// Scratch space for storing the byte record. record: csv::ByteRecord, /// Set to true when the iterator has been exhausted. done: bool, } impl AKAIndexRecords { /// Create a new streaming iterator over indexable AKA records. 
fn new(rdr: csv::Reader) -> AKAIndexRecords { AKAIndexRecords { rdr, record: csv::ByteRecord::new(), done: false } } } impl Iterator for AKAIndexRecords { type Item = Result; /// Advance to the next indexable record and return it. If no more /// records exist, return `None`. /// /// If there was a problem parsing or reading from the underlying CSV /// data, then an error is returned. fn next(&mut self) -> Option> { macro_rules! itry { ($e:expr) => { match $e { Err(err) => return Some(Err(Error::csv(err))), Ok(v) => v, } }; } if self.done { return None; } // Only initialize the record if this is our first go at it. // Otherwise, previous call leaves next record in `AKAIndexRecord`. if self.record.is_empty() { if !itry!(self.rdr.read_byte_record(&mut self.record)) { return None; } } let mut irecord = AKAIndexRecord { id: self.record[0].to_vec(), offset: self.record.position().expect("position on row").byte(), count: 1, }; while itry!(self.rdr.read_byte_record(&mut self.record)) { if irecord.id != &self.record[0] { break; } irecord.count += 1; } // If we've read the last record then we're done! 
if self.rdr.is_done() { self.done = true; } Some(Ok(irecord)) } } #[cfg(test)] mod tests { use super::*; use crate::util::csv_reader_builder; #[test] fn aka_index_records1() { let data = r"titleId ordering title region language types attributes isOriginalTitle tt0117019 1 Hommes à l'huile FR \N \N \N 0 tt0117019 2 Männer in Öl DE \N \N \N 0 tt0117019 3 Men in Oil XEU en festival \N 0 tt0117019 4 Männer in Öl: Annäherungsversuche an die Malerin Susanne Hay \N \N original \N 1 tt0117019 5 Men in Oil XWW en \N \N 0 tt0117020 1 Mendigos sin fronteras ES \N \N \N 0 tt0117021 1 Menno's Mind US \N \N \N 0 tt0117021 2 Menno's Mind \N \N original \N 1 tt0117021 3 The Matrix 2 RU \N video \N 0 tt0117021 4 Virtuális elme HU \N imdbDisplay \N 0 tt0117021 5 Power.com US \N video \N 0 tt0117021 6 La mente de Menno ES \N \N \N 0 tt0117021 7 Power.com CA en video \N 0 tt0117021 8 Terror im Computer DE \N \N \N 0 tt0117022 1 Menopause Song CA \N \N \N 0 tt0117023 1 Les menteurs FR \N \N \N 0"; let rdr = csv_reader_builder().from_reader(data.as_bytes()); let records: Vec = AKAIndexRecords::new(rdr).collect::>().unwrap(); assert_eq!(records.len(), 5); assert_eq!(records[0].id, b"tt0117019"); assert_eq!(records[0].count, 5); assert_eq!(records[1].id, b"tt0117020"); assert_eq!(records[1].count, 1); assert_eq!(records[2].id, b"tt0117021"); assert_eq!(records[2].count, 8); assert_eq!(records[3].id, b"tt0117022"); assert_eq!(records[3].count, 1); assert_eq!(records[4].id, b"tt0117023"); assert_eq!(records[4].count, 1); } #[test] fn aka_index_records2() { let data = r"titleId ordering title region language types attributes isOriginalTitle tt0117019 1 Hommes à l'huile FR \N \N \N 0 tt0117019 2 Männer in Öl DE \N \N \N 0 tt0117019 3 Men in Oil XEU en festival \N 0 tt0117019 4 Männer in Öl: Annäherungsversuche an die Malerin Susanne Hay \N \N original \N 1 tt0117019 5 Men in Oil XWW en \N \N 0 tt0117020 1 Mendigos sin fronteras ES \N \N \N 0 tt0117021 1 Menno's Mind US \N \N \N 0 tt0117021 2 
Menno's Mind \N \N original \N 1 tt0117021 3 The Matrix 2 RU \N video \N 0 tt0117021 4 Virtuális elme HU \N imdbDisplay \N 0 tt0117021 5 Power.com US \N video \N 0 tt0117021 6 La mente de Menno ES \N \N \N 0 tt0117021 7 Power.com CA en video \N 0 tt0117021 8 Terror im Computer DE \N \N \N 0"; let rdr = csv_reader_builder().from_reader(data.as_bytes()); let records: Vec = AKAIndexRecords::new(rdr).collect::>().unwrap(); assert_eq!(records.len(), 3); assert_eq!(records[0].id, b"tt0117019"); assert_eq!(records[0].count, 5); assert_eq!(records[1].id, b"tt0117020"); assert_eq!(records[1].count, 1); assert_eq!(records[2].id, b"tt0117021"); assert_eq!(records[2].count, 8); } #[test] fn aka_index_records3() { let data = r"titleId ordering title region language types attributes isOriginalTitle tt0117021 1 Menno's Mind US \N \N \N 0 tt0117021 2 Menno's Mind \N \N original \N 1 tt0117021 3 The Matrix 2 RU \N video \N 0 tt0117021 4 Virtuális elme HU \N imdbDisplay \N 0 tt0117021 5 Power.com US \N video \N 0 tt0117021 6 La mente de Menno ES \N \N \N 0 tt0117021 7 Power.com CA en video \N 0 tt0117021 8 Terror im Computer DE \N \N \N 0"; let rdr = csv_reader_builder().from_reader(data.as_bytes()); let records: Vec = AKAIndexRecords::new(rdr).collect::>().unwrap(); assert_eq!(records.len(), 1); assert_eq!(records[0].id, b"tt0117021"); assert_eq!(records[0].count, 8); } #[test] fn aka_index_records4() { let data = r"titleId ordering title region language types attributes isOriginalTitle tt0117021 1 Menno's Mind US \N \N \N 0"; let rdr = csv_reader_builder().from_reader(data.as_bytes()); let records: Vec = AKAIndexRecords::new(rdr).collect::>().unwrap(); assert_eq!(records.len(), 1); assert_eq!(records[0].id, b"tt0117021"); assert_eq!(records[0].count, 1); } } ================================================ FILE: imdb-index/src/index/episode.rs ================================================ use std::cmp; use std::path::Path; use std::u32; use fst::{IntoStreamer, Streamer}; use 
memmap::Mmap; use crate::error::{Error, Result}; use crate::index::csv_file; use crate::record::Episode; use crate::util::{fst_set_builder_file, fst_set_file, IMDB_EPISODE}; /// The name of the episode index file. /// /// The episode index maps TV show ids to episodes. The index is constructed /// in a way where either of the following things can be used as look up keys: /// /// tvshow IMDb title ID /// (tvshow IMDb title ID, season number) /// /// In particular, the index itself stores the entire episode record, and it /// can be re-constituted without re-visiting the original episode data file. const SEASONS: &str = "episode.seasons.fst"; /// The name of the TV show index file. /// /// The TV show index maps episode IMDb title IDs to tvshow IMDb title IDs. /// This allows us to quickly look up the TV show corresponding to an episode /// in search results. /// /// The format of this index is an FST set, where each key corresponds to the /// episode ID joined with the TV show ID by a `NUL` byte. This lets us do /// a range query on the set when given the episode ID to find the TV show ID. const TVSHOWS: &str = "episode.tvshows.fst"; /// An episode index that supports retrieving season and episode information /// quickly. #[derive(Debug)] pub struct Index { seasons: fst::Set, tvshows: fst::Set, } impl Index { /// Open an episode index from the given index directory. pub fn open>(index_dir: P) -> Result { let index_dir = index_dir.as_ref(); // We claim it is safe to open the following memory map because we // don't mutate them and no other process (should) either. let seasons = unsafe { fst_set_file(index_dir.join(SEASONS))? }; let tvshows = unsafe { fst_set_file(index_dir.join(TVSHOWS))? }; Ok(Index { seasons, tvshows }) } /// Create an episode index from the given IMDb data directory and write /// it to the given index directory. If an episode index already exists, /// then it is overwritten. 
pub fn create, P2: AsRef>( data_dir: P1, index_dir: P2, ) -> Result { let data_dir = data_dir.as_ref(); let index_dir = index_dir.as_ref(); let mut buf = vec![]; let mut seasons = fst_set_builder_file(index_dir.join(SEASONS))?; let mut tvshows = fst_set_builder_file(index_dir.join(TVSHOWS))?; let mut episodes = read_sorted_episodes(data_dir)?; for episode in &episodes { buf.clear(); write_episode(episode, &mut buf)?; seasons.insert(&buf).map_err(Error::fst)?; } episodes.sort_by(|e1, e2| { (&e1.id, &e1.tvshow_id).cmp(&(&e2.id, &e2.tvshow_id)) }); for episode in &episodes { buf.clear(); write_tvshow(&episode, &mut buf)?; tvshows.insert(&buf).map_err(Error::fst)?; } seasons.finish().map_err(Error::fst)?; tvshows.finish().map_err(Error::fst)?; log::info!("{} episodes indexed", episodes.len()); Index::open(index_dir) } /// Return a sequence of episodes for the given TV show IMDb identifier. /// /// The episodes are sorted in order of season number and episode number. /// Episodes without season/episode numbers are sorted after episodes with /// numbers. pub fn seasons(&self, tvshow_id: &[u8]) -> Result> { let mut upper = tvshow_id.to_vec(); upper.push(0xFF); let mut episodes = vec![]; let mut stream = self.seasons.range().ge(tvshow_id).le(upper).into_stream(); while let Some(episode_bytes) = stream.next() { episodes.push(read_episode(episode_bytes)?); } Ok(episodes) } /// Return a sequence of episodes for the given TV show IMDb identifier and /// season number. /// /// The episodes are sorted in order of episode number. Episodes without /// episode numbers are sorted after episodes with numbers. 
pub fn episodes( &self, tvshow_id: &[u8], season: u32, ) -> Result> { let mut lower = tvshow_id.to_vec(); lower.push(0x00); lower.extend_from_slice(&season.to_be_bytes()); lower.extend_from_slice(&0u32.to_be_bytes()); let mut upper = tvshow_id.to_vec(); upper.push(0x00); upper.extend_from_slice(&season.to_be_bytes()); upper.extend_from_slice(&u32::MAX.to_be_bytes()); let mut episodes = vec![]; let mut stream = self.seasons.range().ge(lower).le(upper).into_stream(); while let Some(episode_bytes) = stream.next() { episodes.push(read_episode(episode_bytes)?); } Ok(episodes) } /// Return the episode information for the given episode IMDb identifier. /// /// If no episode information for the given ID exists, then `None` is /// returned. pub fn episode(&self, episode_id: &[u8]) -> Result> { let mut upper = episode_id.to_vec(); upper.push(0xFF); let mut stream = self.tvshows.range().ge(episode_id).le(upper).into_stream(); while let Some(tvshow_bytes) = stream.next() { return Ok(Some(read_tvshow(tvshow_bytes)?)); } Ok(None) } } fn read_sorted_episodes(data_dir: &Path) -> Result> { // We claim it is safe to open the following memory map because we don't // mutate them and no other process (should) either. 
let mut rdr = csv_file(data_dir.join(IMDB_EPISODE))?; let mut records = vec![]; for result in rdr.deserialize() { let record: Episode = result.map_err(Error::csv)?; records.push(record); } records.sort_by(cmp_episode); Ok(records) } fn cmp_episode(ep1: &Episode, ep2: &Episode) -> cmp::Ordering { let k1 = ( &ep1.tvshow_id, ep1.season.unwrap_or(u32::MAX), ep1.episode.unwrap_or(u32::MAX), &ep1.id, ); let k2 = ( &ep2.tvshow_id, ep2.season.unwrap_or(u32::MAX), ep2.episode.unwrap_or(u32::MAX), &ep2.id, ); k1.cmp(&k2) } fn read_episode(bytes: &[u8]) -> Result { let nul = match bytes.iter().position(|&b| b == 0) { Some(nul) => nul, None => bug!("could not find nul byte"), }; let tvshow_id = match String::from_utf8(bytes[..nul].to_vec()) { Err(err) => bug!("tvshow_id invalid UTF-8: {}", err), Ok(tvshow_id) => tvshow_id, }; let mut i = nul + 1; let season = from_optional_u32("season", &bytes[i..])?; i += 4; let episode = from_optional_u32("episode number", &bytes[i..])?; i += 4; let id = match String::from_utf8(bytes[i..].to_vec()) { Err(err) => bug!("episode id invalid UTF-8: {}", err), Ok(id) => id, }; Ok(Episode { id, tvshow_id, season, episode }) } fn write_episode(ep: &Episode, buf: &mut Vec) -> Result<()> { if ep.tvshow_id.as_bytes().iter().any(|&b| b == 0) { bug!("unsupported tvshow id (with NUL byte) for {:?}", ep); } buf.extend_from_slice(ep.tvshow_id.as_bytes()); buf.push(0x00); buf.extend_from_slice(&to_optional_season(ep)?.to_be_bytes()); buf.extend_from_slice(&to_optional_epnum(ep)?.to_be_bytes()); buf.extend_from_slice(ep.id.as_bytes()); Ok(()) } fn read_tvshow(bytes: &[u8]) -> Result { let nul = match bytes.iter().position(|&b| b == 0) { Some(nul) => nul, None => bug!("could not find nul byte"), }; let id = match String::from_utf8(bytes[..nul].to_vec()) { Err(err) => bug!("episode id invalid UTF-8: {}", err), Ok(tvshow_id) => tvshow_id, }; let mut i = nul + 1; let season = from_optional_u32("season", &bytes[i..])?; i += 4; let episode = 
from_optional_u32("episode number", &bytes[i..])?; i += 4; let tvshow_id = match String::from_utf8(bytes[i..].to_vec()) { Err(err) => bug!("tvshow_id invalid UTF-8: {}", err), Ok(tvshow_id) => tvshow_id, }; Ok(Episode { id, tvshow_id, season, episode }) } fn write_tvshow(ep: &Episode, buf: &mut Vec) -> Result<()> { if ep.id.as_bytes().iter().any(|&b| b == 0) { bug!("unsupported episode id (with NUL byte) for {:?}", ep); } buf.extend_from_slice(ep.id.as_bytes()); buf.push(0x00); buf.extend_from_slice(&to_optional_season(ep)?.to_be_bytes()); buf.extend_from_slice(&to_optional_epnum(ep)?.to_be_bytes()); buf.extend_from_slice(ep.tvshow_id.as_bytes()); Ok(()) } fn from_optional_u32( label: &'static str, bytes: &[u8], ) -> Result> { if bytes.len() < 4 { bug!("not enough bytes to read optional {}", label); } Ok(match u32::from_be_bytes(bytes[..4].try_into().unwrap()) { u32::MAX => None, x => Some(x), }) } fn to_optional_season(ep: &Episode) -> Result { match ep.season { None => Ok(u32::MAX), Some(x) => { if x == u32::MAX { bug!("unsupported season number {} for {:?}", x, ep); } Ok(x) } } } fn to_optional_epnum(ep: &Episode) -> Result { match ep.episode { None => Ok(u32::MAX), Some(x) => { if x == u32::MAX { bug!("unsupported episode number {} for {:?}", x, ep); } Ok(x) } } } #[cfg(test)] mod tests { use super::Index; use crate::index::tests::TestContext; use std::collections::HashMap; #[test] fn basics() { let ctx = TestContext::new("small"); let idx = Index::create(ctx.data_dir(), ctx.index_dir()).unwrap(); let eps = idx.seasons(b"tt0096697").unwrap(); let mut counts: HashMap = HashMap::new(); for ep in eps { *counts.entry(ep.season.unwrap()).or_insert(0) += 1; } assert_eq!(counts.len(), 3); assert_eq!(counts[&1], 13); assert_eq!(counts[&2], 22); assert_eq!(counts[&3], 24); } #[test] fn by_season() { let ctx = TestContext::new("small"); let idx = Index::create(ctx.data_dir(), ctx.index_dir()).unwrap(); let eps = idx.episodes(b"tt0096697", 2).unwrap(); let mut counts: 
HashMap = HashMap::new(); for ep in eps { *counts.entry(ep.season.unwrap()).or_insert(0) += 1; } println!("{:?}", counts); assert_eq!(counts.len(), 1); assert_eq!(counts[&2], 22); } #[test] fn tvshow() { let ctx = TestContext::new("small"); let idx = Index::create(ctx.data_dir(), ctx.index_dir()).unwrap(); let ep = idx.episode(b"tt0701063").unwrap().unwrap(); assert_eq!(ep.tvshow_id, "tt0096697"); } } ================================================ FILE: imdb-index/src/index/id.rs ================================================ use std::fs::File; use std::io; use std::path::Path; use memmap::Mmap; use crate::error::{Error, Result}; use crate::util::{fst_map_builder_file, fst_map_file}; /// An index that maps arbitrary length identifiers to 64-bit integers. /// /// An ID index is often useful for mapping human readable identifiers or /// "natural keys" to other more convenient forms, such as file offsets. #[derive(Debug)] pub struct IndexReader { idx: fst::Map, } impl IndexReader { /// Open's an ID index reader from the given file path. pub fn from_path>(path: P) -> Result { // We claim it is safe to open the following memory map because we // don't mutate them and no other process (should) either. Ok(IndexReader { idx: unsafe { fst_map_file(path)? } }) } /// Return the integer associated with the given ID, if it exists. pub fn get(&self, key: &[u8]) -> Option { self.idx.get(key) } } /// An ID index writer that requires that identifiers are given in /// lexicographically ascending order. pub struct IndexSortedWriter { wtr: fst::MapBuilder, } impl IndexSortedWriter> { /// Create an index writer that writes the index to the given file path. pub fn from_path>( path: P, ) -> Result>> { Ok(IndexSortedWriter { wtr: fst_map_builder_file(path)? }) } } impl IndexSortedWriter { /// Associate the given identifier with the given integer. /// /// If the given key is not strictly lexicographically greater than the /// previous key, then an error is returned. 
pub fn insert(&mut self, key: &[u8], value: u64) -> Result<()> { self.wtr.insert(key, value).map_err(Error::fst)?; Ok(()) } /// Finish writing the index. /// /// This must be called, otherwise the index will likely be unreadable. pub fn finish(self) -> Result<()> { self.wtr.finish().map_err(Error::fst)?; Ok(()) } } ================================================ FILE: imdb-index/src/index/mod.rs ================================================ use std::fs; use std::io; use std::path::{Path, PathBuf}; use std::thread; use std::time::Instant; use memmap::Mmap; use serde::{Deserialize, Serialize}; use crate::error::{Error, Result}; use crate::record::{Episode, Rating, Title, TitleKind}; use crate::scored::SearchResults; use crate::util::{ create_file, csv_file, csv_mmap, open_file, NiceDuration, IMDB_BASICS, }; pub use self::aka::AKARecordIter; pub use self::names::{NameQuery, NameScorer, NgramType}; mod aka; mod episode; mod id; mod names; mod rating; #[cfg(test)] mod tests; mod writer; /// The version of the index format on disk. /// /// Generally speaking, if the version of the index on disk doesn't exactly /// match the version expected by this code, then the index won't be read. /// The caller must then re-generate the index. /// /// This version represents all indexing structures on disk in this module. const VERSION: u64 = 1; /// The name of the title file index. /// /// This index represents a map from the IMDb title ID to the file offset /// corresponding to that record in title.basics.tsv. const TITLE: &str = "title.fst"; /// The name of the file containing the index configuration. /// /// The index configuration is a JSON file with some meta data about this /// index, such as its version. const CONFIG: &str = "config.json"; /// A media entity is a title with optional episode and rating records. /// /// A media entity makes it convenient to deal with the complete information /// of an IMDb media record. 
This is the default value returned by search /// routines such as what the [`Searcher`](struct.Searcher.html) provides, and /// can also be cheaply constructed by an [`Index`](struct.Index.html) given a /// [`Title`](struct.Title.html) or an IMDb ID. #[derive(Clone, Debug)] pub struct MediaEntity { title: Title, episode: Option, rating: Option, } impl MediaEntity { /// Return a reference to the underlying `Title`. pub fn title(&self) -> &Title { &self.title } /// Return a reference to the underlying `Episode`, if it exists. pub fn episode(&self) -> Option<&Episode> { self.episode.as_ref() } /// Return a reference to the underlying `Rating`, if it exists. pub fn rating(&self) -> Option<&Rating> { self.rating.as_ref() } } /// An index into IMDb titles and their associated data. /// /// This index consists of a set of on disk index data structures in addition /// to the uncompressed IMDb `tsv` files. The on disk index structures are used /// to provide access to the records in the `tsv` files efficiently. /// /// With this index, one can do the following things: /// /// * Return a ranked list /// [`Title`](struct.Title.html) /// records matching a fuzzy name query. /// * Access any `Title` record by ID in constant time. /// * Access all /// [`AKA`](struct.AKA.html) /// records for any `Title` in constant time. /// * Access the /// [`Rating`](struct.Rating.html) /// for any `Title` in constant time. /// * Access the complete set of /// [`Episode`](struct.Episode.html) /// records for any TV show in constant time. /// * Access the specific `Episode` given its ID in constant time. #[derive(Debug)] pub struct Index { /// The directory containing the IMDb tsv files. data_dir: PathBuf, /// The directory containing this crate's index structures. index_dir: PathBuf, /// A seekable reader for `title.basics.tsv`. The index structures /// typically return offsets that can be used to seek this reader to the /// beginning of any `Title` record. 
csv_basic: csv::Reader>, /// The name index. This is what provides fuzzy queries. idx_names: names::IndexReader, /// The AKA index. idx_aka: aka::Index, /// The episode index. idx_episode: episode::Index, /// The rating index. idx_rating: rating::Index, /// The title index. idx_title: id::IndexReader, } #[derive(Debug, Deserialize, Serialize)] struct Config { version: u64, } impl Index { /// Open an existing index using default settings. If the index does not /// exist, or if there was a problem opening it, then this returns an /// error. /// /// Generally, this method is cheap to call. It opens some file /// descriptors, but otherwise does no work. /// /// `data_dir` should be the directory containing decompressed IMDb /// `tsv` files. See: https://www.imdb.com/interfaces/ /// /// `index_dir` should be the directory containing a previously created /// index using `Index::create`. pub fn open, P2: AsRef>( data_dir: P1, index_dir: P2, ) -> Result { IndexBuilder::new().open(data_dir, index_dir) } /// Create a new index using default settings. /// /// Calling this method is expensive, and one should expect this to take /// dozens of seconds or more to complete. /// /// `data_dir` should be the directory containing decompressed IMDb tsv` /// `files. See: https://www.imdb.com/interfaces/ /// /// `index_dir` should be the directory containing a previously created /// index using `Index::create`. /// /// This will overwrite any previous index that may have existed in /// `index_dir`. pub fn create, P2: AsRef>( data_dir: P1, index_dir: P2, ) -> Result { IndexBuilder::new().create(data_dir, index_dir) } /// Attempt to clone this index, returning a distinct `Index`. /// /// This is as cheap to call as `Index::open` and returns an error if there /// was a problem reading the underlying index. /// /// This is useful when one wants to query the same `Index` on disk from /// multiple threads. 
pub fn try_clone(&self) -> Result { Index::open(&self.data_dir, &self.index_dir) } /// Search this index for `Title` records whose name matches the given /// query. /// /// The query controls the following things: /// /// * The name to search for. /// * The maximum number of results returned. /// * The scorer to use to rank results. /// /// The name can be any string. It is normalized and broken down into /// component pieces, which are then used to quickly search all existing /// titles quickly and fuzzily. /// /// This returns an error if there was a problem reading the index or the /// underlying CSV data. pub fn search( &mut self, query: &names::NameQuery, ) -> Result> { let mut results = SearchResults::new(); // The name index gives us back scores with offsets. The offset can be // used to seek our `Title` CSV reader to the corresponding record and // read it in constant time. for result in self.idx_names.search(query) { let title = match self.read_record(*result.value())? { None => continue, Some(title) => title, }; results.push(result.map(|_| title)); } Ok(results) } /// Returns the `MediaEntity` for the given IMDb ID. /// /// An entity includes an [`Episode`](struct.Episode.html) and /// [`Rating`](struct.Rating.html) records if they exist for the title. /// /// This returns an error if there was a problem reading the underlying /// index. If no such title exists for the given ID, then `None` is /// returned. pub fn entity(&mut self, id: &str) -> Result> { match self.title(id)? { None => Ok(None), Some(title) => self.entity_from_title(title).map(Some), } } /// Returns the `MediaEntity` for the given `Title`. /// /// This is like the `entity` method, except it takes a `Title` record as /// given. 
pub fn entity_from_title(&mut self, title: Title) -> Result { let episode = match title.kind { TitleKind::TVEpisode => self.episode(&title.id)?, _ => None, }; let rating = self.rating(&title.id)?; Ok(MediaEntity { title, episode, rating }) } /// Returns the `Title` record for the given IMDb ID. /// /// This returns an error if there was a problem reading the underlying /// index. If no such title exists for the given ID, then `None` is /// returned. pub fn title(&mut self, id: &str) -> Result> { match self.idx_title.get(id.as_bytes()) { None => Ok(None), Some(offset) => self.read_record(offset), } } /// Returns an iterator over all `AKA` records for the given IMDb ID. /// /// If no AKA records exist for the given ID, then an empty iterator is /// returned. /// /// If there was a problem reading the index, then an error is returned. pub fn aka_records(&mut self, id: &str) -> Result { self.idx_aka.find(id.as_bytes()) } /// Returns the `Rating` associated with the given IMDb ID. /// /// If no rating exists for the given ID, then this returns `None`. /// /// If there was a problem reading the index, then an error is returned. pub fn rating(&mut self, id: &str) -> Result> { self.idx_rating.rating(id.as_bytes()) } /// Returns all of the episodes for the given TV show. The TV show should /// be identified by its IMDb ID. /// /// If the given ID isn't a TV show or if the TV show doesn't have any /// episodes, then an empty list is returned. /// /// The episodes returned are sorted in order of their season and episode /// numbers. Episodes without a season or episode number are sorted after /// episodes with a season or episode number. /// /// If there was a problem reading the index, then an error is returned. pub fn seasons(&mut self, tvshow_id: &str) -> Result> { self.idx_episode.seasons(tvshow_id.as_bytes()) } /// Returns all of the episodes for the given TV show and season number. 
/// The TV show should be identified by its IMDb ID, and the season should /// be identified by its number. (Season numbers generally start at `1`.) /// /// If the given ID isn't a TV show or if the TV show doesn't have any /// episodes for the given season, then an empty list is returned. /// /// The episodes returned are sorted in order of their episode numbers. /// Episodes without an episode number are sorted after episodes with an /// episode number. /// /// If there was a problem reading the index, then an error is returned. pub fn episodes( &mut self, tvshow_id: &str, season: u32, ) -> Result> { self.idx_episode.episodes(tvshow_id.as_bytes(), season) } /// Return the episode corresponding to the given IMDb ID. /// /// If the ID doesn't correspond to an episode, then `None` is returned. /// /// If there was a problem reading the index, then an error is returned. pub fn episode(&mut self, episode_id: &str) -> Result> { self.idx_episode.episode(episode_id.as_bytes()) } /// Returns the data directory that this index returns results for. pub fn data_dir(&self) -> &Path { &self.data_dir } /// Returns the directory containing this index's files. pub fn index_dir(&self) -> &Path { &self.index_dir } /// Read the CSV `Title` record beginning at the given file offset. /// /// If no such record exists, then this returns `None`. /// /// If there was a problem reading the underlying CSV data, then an error /// is returned. /// /// If the given offset does not point to the start of a record in the CSV /// data, then the behavior of this method is unspecified. fn read_record(&mut self, offset: u64) -> Result> { let mut pos = csv::Position::new(); pos.set_byte(offset); self.csv_basic.seek(pos).map_err(Error::csv)?; let mut record = csv::StringRecord::new(); if !self.csv_basic.read_record(&mut record).map_err(Error::csv)? { Ok(None) } else { let headers = self.csv_basic.headers().map_err(Error::csv)?; Ok(record.deserialize(Some(headers)).map_err(Error::csv)?) 
} } }

/// A builder for opening or creating an `Index`.
#[derive(Debug)]
pub struct IndexBuilder {
    // The ngram generation strategy used when creating an index.
    ngram_type: NgramType,
    // The ngram size used when creating an index.
    ngram_size: usize,
}

impl IndexBuilder {
    /// Create a new builder with a default configuration.
    pub fn new() -> IndexBuilder {
        IndexBuilder { ngram_type: NgramType::default(), ngram_size: 3 }
    }

    /// Use the current configuration to open an existing index. If the index
    /// does not exist, or if there was a problem opening it, then this returns
    /// an error.
    ///
    /// Generally, this method is cheap to call. It opens some file
    /// descriptors, but otherwise does no work.
    ///
    /// `data_dir` should be the directory containing decompressed IMDb tsv
    /// files. See: https://www.imdb.com/interfaces/
    ///
    /// `index_dir` should be the directory containing a previously created
    /// index using `Index::create`.
    ///
    /// Note that settings for index creation are ignored.
    pub fn open<P1: AsRef<Path>, P2: AsRef<Path>>(
        &self,
        data_dir: P1,
        index_dir: P2,
    ) -> Result<Index> {
        let data_dir = data_dir.as_ref();
        let index_dir = index_dir.as_ref();
        log::debug!("opening index {}", index_dir.display());

        // Refuse to open indexes written by a different format version.
        let config_file = open_file(index_dir.join(CONFIG))?;
        let config: Config = serde_json::from_reader(config_file)
            .map_err(|e| Error::config(e.to_string()))?;
        if config.version != VERSION {
            return Err(Error::version(VERSION, config.version));
        }
        Ok(Index {
            data_dir: data_dir.to_path_buf(),
            index_dir: index_dir.to_path_buf(),
            // We claim it is safe to open the following memory map because we
            // don't mutate them and no other process (should) either.
            csv_basic: unsafe { csv_mmap(data_dir.join(IMDB_BASICS))? },
            idx_names: names::IndexReader::open(index_dir)?,
            idx_aka: aka::Index::open(data_dir, index_dir)?,
            idx_episode: episode::Index::open(index_dir)?,
            idx_rating: rating::Index::open(index_dir)?,
            idx_title: id::IndexReader::from_path(index_dir.join(TITLE))?,
        })
    }

    /// Use the current configuration to create a new index.
    ///
    /// Calling this method is expensive, and one should expect this to take
    /// dozens of seconds or more to complete.
    ///
    /// `data_dir` should be the directory containing decompressed IMDb tsv
    /// files. See: https://www.imdb.com/interfaces/
    ///
    /// `index_dir` should be the directory in which the index files will be
    /// written.
    ///
    /// This will overwrite any previous index that may have existed in
    /// `index_dir`.
    pub fn create<P1: AsRef<Path>, P2: AsRef<Path>>(
        &self,
        data_dir: P1,
        index_dir: P2,
    ) -> Result<Index> {
        let data_dir = data_dir.as_ref();
        let index_dir = index_dir.as_ref();
        fs::create_dir_all(index_dir)
            .map_err(|e| Error::io_path(e, index_dir))?;
        log::info!("creating index at {}", index_dir.display());

        // Creating the rating and episode indices are completely independent
        // from the name/AKA indexes, so do them in a background thread. The
        // episode index takes long enough to build to justify this.
        let job = {
            let data_dir = data_dir.to_path_buf();
            let index_dir = index_dir.to_path_buf();
            thread::spawn(move || -> Result<()> {
                let start = Instant::now();
                rating::Index::create(&data_dir, &index_dir)?;
                log::info!(
                    "created rating index (took {})",
                    NiceDuration::since(start)
                );

                let start = Instant::now();
                episode::Index::create(&data_dir, &index_dir)?;
                log::info!(
                    "created episode index (took {})",
                    NiceDuration::since(start)
                );
                Ok(())
            })
        };

        let start = Instant::now();
        let mut aka_index = aka::Index::create(data_dir, index_dir)?;
        log::info!("created AKA index (took {})", NiceDuration::since(start));

        let start = Instant::now();
        create_name_index(
            &mut aka_index,
            data_dir,
            index_dir,
            self.ngram_type,
            self.ngram_size,
        )?;
        log::info!(
            "created name index, ngram type: {}, ngram size: {} (took {})",
            self.ngram_type,
            self.ngram_size,
            NiceDuration::since(start)
        );
        // Propagate any error from the background rating/episode job.
        job.join().unwrap()?;

        // Write out our config.
        let config_file = create_file(index_dir.join(CONFIG))?;
        serde_json::to_writer_pretty(
            config_file,
            &Config { version: VERSION },
        )
        .map_err(|e| Error::config(e.to_string()))?;
        self.open(data_dir, index_dir)
    }

    /// Set the type of ngram generation to use.
    ///
    /// The default type is `Window`.
    pub fn ngram_type(&mut self, ngram_type: NgramType) -> &mut IndexBuilder {
        self.ngram_type = ngram_type;
        self
    }

    /// Set the ngram size on this index.
    ///
    /// When creating an index, ngrams with this size will be used.
    pub fn ngram_size(&mut self, ngram_size: usize) -> &mut IndexBuilder {
        self.ngram_size = ngram_size;
        self
    }
}

impl Default for IndexBuilder {
    fn default() -> IndexBuilder {
        IndexBuilder::new()
    }
}

/// Creates the name index from the title tsv data and an AKA index. The AKA
/// index is used to index additional names for each title record to improve
/// recall during search.
///
/// To avoid a second pass through the title records, this also creates the
/// title ID index, which provides an index for looking up a `Title` by its
/// ID in constant time.
fn create_name_index(
    aka_index: &mut aka::Index,
    data_dir: &Path,
    index_dir: &Path,
    ngram_type: NgramType,
    ngram_size: usize,
) -> Result<()> {
    // For logging.
    let (mut count, mut title_count) = (0u64, 0u64);
    let mut wtr =
        names::IndexWriter::open(index_dir, ngram_type, ngram_size)?;
    let mut twtr = id::IndexSortedWriter::from_path(index_dir.join(TITLE))?;
    let mut rdr = csv_file(data_dir.join(IMDB_BASICS))?;

    let mut record = csv::StringRecord::new();
    while rdr.read_record(&mut record).map_err(Error::csv)? {
        // The byte offset of this row is the `NameID` used everywhere else.
        let pos = record.position().expect("position on row");
        let id = &record[0];
        let title = &record[2];
        let original_title = &record[3];
        let is_adult = &record[4] == "1";
        if is_adult {
            // TODO: Expose an option to permit this.
            continue;
        }

        count += 1;
        title_count += 1;
        twtr.insert(id.as_bytes(), pos.byte())?;
        // Index the primary name.
wtr.insert(pos.byte(), title)?; if title != original_title { // Index the "original" name. wtr.insert(pos.byte(), original_title)?; count += 1; } // Now index all of the alternate names, if they exist. for result in aka_index.find(id.as_bytes())? { let akarecord = result?; if title != akarecord.title { wtr.insert(pos.byte(), &akarecord.title)?; count += 1; } } } wtr.finish()?; twtr.finish()?; log::info!("{} titles indexed", title_count); log::info!("{} total names indexed", count); Ok(()) } ================================================ FILE: imdb-index/src/index/names.rs ================================================ use std::cmp; use std::collections::{binary_heap, BinaryHeap}; use std::fmt; use std::fs::File; use std::io::{self, Write}; use std::path::Path; use std::str::{self, FromStr}; use std::time::Instant; use fnv::FnvHashMap; use memmap::Mmap; use serde::{Deserialize, Serialize}; use crate::error::{Error, Result}; use crate::index::writer::CursorWriter; use crate::scored::{Scored, SearchResults}; use crate::util::{ fst_map_builder_file, fst_map_file, mmap_file, open_file, NiceDuration, }; /// The name of the file containing the index configuration. /// /// The index configuration is a JSON file with some meta data about this /// index, such as its version, ngram size and aggregate statistics about the /// corpus that has been indexed. const CONFIG: &str = "names.config.json"; /// The name of the ngram term index. /// /// The ngram term index maps ngrams (fixed size sequences of Unicode /// codepoints) to file offsets. Each file offset points to the postings for /// the corresponding term. const NGRAM: &str = "names.ngram.fst"; /// The name of the postings list index. /// /// The postings list contains an entry for every term in the ngram index. /// Each entry corresponds to a list of document/frequency pairs. Namely, each /// entry is a DocID and a frequency count indicating how many times the /// corresponding term appeared in that document. 
Each entry in the list is /// encoded as a single 32 little-endian integer. The high 4 bits represent /// the frequency (which is capped at 15, a reasonable number for indexing /// short name strings) while the low 28 bits represent the doc id. The /// `MAX_DOC_ID` constant below ensures we make sure to never use a doc id /// that won't fit this encoding scheme. /// /// The last eight bytes in the postings index contains a 64-bit little-endian /// encoded integer indicating the average length of all documents represented /// by the ngram index. The length is recorded in units of terms, which /// generally correspond to the total number of ngrams in a name. const POSTINGS: &str = "names.postings.idx"; /// The name of the identifier map index. /// /// This file maps `DocID`s to `NameID`s. It consists of a sequence of /// 64-bit little-endian encoded integers, where the length of the sequence /// corresponds to the total number of names in the index. Each entry in the /// sequence encodes a `NameID`. In other words, the index to this sequence is /// a `DocID` and the value at that index is a `NameID`. /// /// The id map is used to map doc ids returned by the postings to name ids /// which were provided by the caller. This also permits search to deduplicate /// results. That is, we should never return multiple results for the same /// NameID, even though we may have indexed multiple names for the same name /// id. const IDMAP: &str = "names.idmap.idx"; /// The name of the document length index. /// /// This file consists of a sequence of 16-bit little-endian encoded /// integers, where the length of the sequence corresponds to the total number /// of names in the index. Each entry represents the length, in terms, of each /// name. /// /// The lengths are used during scoring to compute a normalization term. This /// allows the scoring mechanism to take document length into account. 
const NORMS: &str = "names.norms.idx"; /// The external identifier for every distinct record represented by this name /// index. There are no restrictions on name ids, and multiple names may be /// indexed that correspond to the same name id. /// /// With respect to IMDb, there is a 1-to-1 correspondence between the records /// in title.basics.tsv and the set of NameIDs, even though there may be /// multiple names for each record. /// /// For IMDb, this is represented by the byte offset of the corresponding /// record in title.basics.tsv. This provides constant time lookup to full /// record. Note, though, that this module knows nothing about such things. /// To this module, name ids are opaque identifiers. pub type NameID = u64; /// An internal surrogate identifier for every distinct name in the index. Note /// that multiple distinct doc ids can map to the same name id. For example, if /// a name has multiple distinct forms, then they each get their own docid, but /// each of the docids will map to the same name id. /// /// The reason why we need DocID in addition to NameID is two fold: /// /// 1. Firstly, we'd like each name variant to have its own term frequency /// count. If every variant shared the same internal id, then names with /// multiple variants would behave as if they were one long name with each /// variant concatenated together. Our ranking scheme takes document length /// into account, so we don't want this. /// 2. Secondly, using an internal ID gives us control over the structure of /// those ids. For example, we can declare them to be a sorted sequence of /// increasing integers. This lets us traverse our postings more efficiently /// during search. type DocID = u32; /// The maximum docid allowed. /// /// When writing postings, we pack docids and their term frequency counts into /// a single u32. We give 4 bits for frequency and 28 bits for docid. 
That /// means we can permit up to 268,435,455 = (1<<28)-1 names, which is plenty /// for all unique names in IMDb. const MAX_DOC_ID: DocID = (1 << 28) - 1; /// A query for searching the name index. /// /// A query provides the name query and defines the maximum number of results /// returned by searching the name index. #[derive(Clone, Debug)] pub struct NameQuery { name: String, size: usize, scorer: NameScorer, stop_word_ratio: f64, } impl NameQuery { /// Create a query that searches the given name. pub fn new(name: &str) -> NameQuery { NameQuery { name: name.to_string(), size: 30, scorer: NameScorer::default(), stop_word_ratio: 0.01, } } /// Set this query's result set size. At most `size` results will be /// returned when searching with this query. pub fn with_size(self, size: usize) -> NameQuery { NameQuery { size, ..self } } /// Set this query's scorer. By default, Okapi BM25 is used. pub fn with_scorer(self, scorer: NameScorer) -> NameQuery { NameQuery { scorer, ..self } } /// Set the ratio (in the range `0.0` to `1.0`, inclusive) at which a term /// is determined to be a stop word. Set to `0.0` to disable. By default /// this is set to a non-zero value. /// /// This ratio is used at query time to partition all of the ngrams in the /// query into two bins: one bin is for "low frequency" ngrams while the /// other is for "high frequency" ngrams. The partitioning is determined /// by this ratio. Namely, if an ngram occurs in fewer than `ratio` /// documents in the entire corpus, then it is considered a low frequency /// ngram. /// /// Once these two partitions are created, both are used to create two /// disjunction queries. The low frequency query drives search results, /// while the high frequency query is only used to boost scores when it /// matches a result yielded by the low frequency query. Otherwise, results /// from the high frequency query aren't considered. 
pub fn with_stop_word_ratio(self, ratio: f64) -> NameQuery { NameQuery { stop_word_ratio: ratio, ..self } } } /// A reader for the name index. #[derive(Debug)] pub struct IndexReader { /// The configuration of this index. This is how we determine index-time /// settings automatically, such as ngram size and type. config: Config, /// The ngram index, also known more generally as the "term index." It maps /// terms (which are ngrams for this index) to offsets into the postings /// file. The offset indicates the start of a list of document ids /// containing that term. ngram: fst::Map, /// The postings. This corresponds to a sequence of lists, where each list /// is a list of document ID/frequency pairs. Each list corresponds to the /// document ids containing a particular term. The beginning of each list /// is pointed to by an offset in the term index. postings: Mmap, /// A sequence of 64-bit little-endian encoded integers that provide a /// map from document ID to name ID. The document ID is an internal /// identifier assigned to each unique name indexed, while the name ID is /// an external identifier provided by users of this index. /// /// This map is used to return name IDs to callers. Namely, results are /// natively represented by document IDs, but they are mapped to name IDs /// during collection of results and subsequently deduped. In particular, /// multiple document IDs can map to the same name ID. /// /// The number of entries in this map is equivalent to the total number of /// names indexed. idmap: Mmap, /// A sequence of 16-bit little-endian encoded integers indicating the /// document length (in terms) of the correspond document ID. /// /// The number of entries in this map is equivalent to the total number of /// names indexed. norms: Mmap, } /// The configuration for this name index. It is JSON encoded to disk. /// /// Note that we don't track the version here. Instead, it is tracked wholesale /// as part of the parent index. 
#[derive(Debug, Deserialize, Serialize)] struct Config { ngram_type: NgramType, ngram_size: usize, avg_document_len: f64, num_documents: u64, } impl IndexReader { /// Open a name index in the given directory. pub fn open>(dir: P) -> Result { let dir = dir.as_ref(); // All of the following open memory maps. We claim it is safe because // we don't mutate them and no other process (should) either. let ngram = unsafe { fst_map_file(dir.join(NGRAM))? }; let postings = unsafe { mmap_file(dir.join(POSTINGS))? }; let idmap = unsafe { mmap_file(dir.join(IDMAP))? }; let norms = unsafe { mmap_file(dir.join(NORMS))? }; let config_file = open_file(dir.join(CONFIG))?; let config: Config = serde_json::from_reader(config_file) .map_err(|e| Error::config(e.to_string()))?; Ok(IndexReader { config, ngram, postings, idmap, norms }) } /// Execute a search. pub fn search(&self, query: &NameQuery) -> SearchResults { let start = Instant::now(); let mut searcher = Searcher::new(self, query); let results = CollectTopK::new(query.size).collect(&mut searcher); log::debug!( "search for {:?} took {}", query, NiceDuration::since(start) ); results } /// Return the name ID used to the index the given document id. /// /// This panics if the given document id does not correspond to an indexed /// document. fn docid_to_nameid(&self, docid: DocID) -> NameID { let start = 8 * (docid as usize); let buf = self.idmap[start..start + 8].try_into().unwrap(); u64::from_le_bytes(buf) } /// Return the length, in terms, of the given document. /// /// This panics if the given document id does not correspond to an indexed /// document. fn document_length(&self, docid: DocID) -> u64 { let start = 2 * (docid as usize); let buf = self.norms[start..start + 2].try_into().unwrap(); u16::from_le_bytes(buf) as u64 } } /// A collector for gathering the top K results from a search. /// /// This maintains a min-heap of search results. When a new result is /// considered, it is compared against the worst result in the heap. 
If the /// candidate is worse, then it is discarded. Otherwise, it is shuffled into /// the heap. struct CollectTopK { /// The total number of hits to collect. k: usize, /// The min-heap, according to score. Note that since BinaryHeap is a /// max-heap by default, we reverse the comparison to get a min-heap. queue: BinaryHeap>>, /// A set for deduplicating results. Namely, multiple doc IDs can map to /// the same name ID. This set makes sure we only collect one name ID. /// /// We map name IDs to scores. In this way, we always report the best /// scoring match. byid: FnvHashMap, } impl CollectTopK { /// Build a new collector that collects at most `k` results. fn new(k: usize) -> CollectTopK { CollectTopK { k, queue: BinaryHeap::with_capacity(k), byid: FnvHashMap::default(), } } /// Collect the top K results from the given searcher using the given /// index reader. Return the results with normalized scores sorted in /// order of best-to-worst. fn collect(mut self, searcher: &mut Searcher) -> SearchResults { if self.k == 0 { return SearchResults::new(); } let index = searcher.index(); let (mut count, mut push_count) = (0, 0); for scored_with_docid in searcher { count += 1; let scored = scored_with_docid.map(|v| index.docid_to_nameid(v)); // Since multiple names can correspond to a single IMDb title, // we dedup our results here. That is, if our result set // already contains this result, then update the score if need // be, and then move on. if let Some(&score) = self.byid.get(scored.value()) { if scored.score() > score { self.byid.insert(*scored.value(), scored.score()); } continue; } let mut dopush = self.queue.len() < self.k; if !dopush { // This unwrap is OK because k > 0 and queue is non-empty. let worst = self.queue.peek_mut().unwrap(); // If our queue is full, then we should only push if this // doc id has a better score than the worst one in the queue. 
if worst.0 < scored { self.byid.remove(worst.0.value()); binary_heap::PeekMut::pop(worst); dopush = true; } } if dopush { push_count += 1; self.byid.insert(*scored.value(), scored.score()); self.queue.push(cmp::Reverse(scored)); } } log::debug!( "collect count: {:?}, collect push count: {:?}", count, push_count ); // Pull out the results from our heap and normalize the scores. let mut results = SearchResults::from_min_heap(&mut self.queue); results.normalize(); results } } /// A searcher for resolving fulltext queries. /// /// A searcher takes a fulltext query, usually typed by an end user, along with /// a scoring function and produces a stream of matching results with scores /// computed via the provided function. Results are always yielded in /// ascending order with respect to document IDs, which are internal IDs /// assigned to each name in the index. /// /// This searcher combines a bit of smarts to handle stop words, usually /// referred to as "dynamic stop word detection." Namely, after the searcher /// splits the query into ngrams, it partitions the ngrams into infrequently /// occurring ngrams and frequently occurring ngrams, according to some /// hard-coded threshold. Each group is then turned into a `Disjunction` /// query. The searcher then visits every doc ID that matches the infrequently /// occurring disjunction. When a score is computed for a doc ID, then its /// score is increased if the frequently occurring disjunction also contains /// that same doc ID. Otherwise, the frequently occurring disjunction isn't /// consulted at all, which permits skipping the score calculation for a /// potentially large number of doc IDs. /// /// When two partitions cannot be created (e.g., all of the terms are /// infrequently occurring or all of the terms are frequently occurring), then /// only one disjunction query is used and no skipping logic is employed. That /// means that a query consisting of all high frequency terms could be quite /// slow. 
/// /// This does of course sacrifice recall for a performance benefit, but so do /// all filtering strategies based on stop words. The benefit of this "dynamic" /// approach is that stop word detection is tailored exactly to the corpus, and /// that stop words can still influence scoring. That means queries like "the /// matrix" will match "The Matrix" better than "Matrix" (which is a legitimate /// example, try it). struct Searcher<'i> { /// A handle to the index. index: &'i IndexReader, /// The primary disjunction query that drives results. Typically, this /// corresponds to the infrequent terms in the query. primary: Disjunction<'i>, /// A disjunction of only high frequency terms. When the query consists /// of exclusively high frequency terms, then this is empty (which matches /// nothing) and `primary` is set to the disjunction of terms. high: Disjunction<'i>, } impl<'i> Searcher<'i> { /// Create a new searcher. fn new(idx: &'i IndexReader, query: &NameQuery) -> Searcher<'i> { let num_docs = idx.config.num_documents as f64; let (mut low, mut high) = (vec![], vec![]); let (mut low_terms, mut high_terms) = (vec![], vec![]); let name = normalize_query(&query.name); let mut query_len = 0; let mut multiset = FnvHashMap::default(); idx.config.ngram_type.iter(idx.config.ngram_size, &name, |term| { *multiset.entry(term).or_insert(0) += 1; query_len += 1; }); for (term, &count) in multiset.iter() { let postings = PostingIter::new(idx, query.scorer, count, term); let ratio = (postings.len() as f64) / num_docs; if ratio < query.stop_word_ratio { low.push(postings); low_terms.push(format!("{}:{}:{:0.6}", term, count, ratio)); } else { high.push(postings); high_terms.push(format!("{}:{}:{:0.6}", term, count, ratio)); } } log::debug!("starting search for: {:?}", name); log::debug!("{:?} low frequency terms: {:?}", low.len(), low_terms); log::debug!("{:?} high frequency terms: {:?}", high.len(), high_terms); if low.is_empty() { Searcher { index: idx, primary: 
Disjunction::new(idx, query_len, query.scorer, high), high: Disjunction::empty(idx, query.scorer), } } else { Searcher { index: idx, primary: Disjunction::new(idx, query_len, query.scorer, low), high: Disjunction::new(idx, query_len, query.scorer, high), } } } /// Return a reference to the underlying index reader. fn index(&self) -> &'i IndexReader { self.index } } impl<'i> Iterator for Searcher<'i> { type Item = Scored; fn next(&mut self) -> Option> { // This is pretty simple. We drive the iterator via the primary // disjunction, which is usually a disjunction of infrequently // occurring ngrams. let mut scored = match self.primary.next() { None => return None, Some(scored) => scored, }; // We then skip our frequently occurring disjunction to the doc ID // yielded above. Any frequently occurring ngrams found then improve // this score. This makes queries like 'the matrix' match 'The Matrix' // better than 'Matrix'. if let Some(other_scored) = self.high.skip_to(*scored.value()) { scored = scored.map_score(|s| s + other_scored.score()); } Some(scored) } } /// A disjunction over a collection of ngrams. A disjunction yields scored /// document IDs for every document that contains any of the terms in this /// disjunction. The more ngrams that match the document in the disjunction, /// the better the score. struct Disjunction<'i> { /// A handle to the underlying index that we're searching. index: &'i IndexReader, /// The number of ngrams in the original query. /// /// This is not necessarily equivalent to the number of ngrams in this /// specific disjunction. Namely, this is used to compute scores, and it /// is important that scores are computed using the total number of ngrams /// and not the number of ngrams in a specific disjunction. For example, /// if a query consisted of 8 infrequent ngrams and 1 frequent ngram, then /// the disjunction containing the single frequent ngram would contribute a /// disproportionately high score. 
query_len: f64,
    /// The scoring function to use.
    scorer: NameScorer,
    /// A min-heap of posting iterators. Each posting iterator corresponds
    /// to an iterator over (doc ID, frequency) pairs for a single ngram,
    /// sorted by doc ID in ascending order.
    ///
    /// A min-heap is a classic way of optimally computing a disjunction
    /// over an arbitrary number of ordered streams.
    queue: BinaryHeap<PostingIter<'i>>,
    /// Whether this disjunction has been exhausted or not.
    is_done: bool,
}

impl<'i> Disjunction<'i> {
    /// Create a new disjunction over the given posting iterators.
    fn new(
        index: &'i IndexReader,
        query_len: usize,
        scorer: NameScorer,
        posting_iters: Vec<PostingIter<'i>>,
    ) -> Disjunction<'i> {
        let mut queue = BinaryHeap::new();
        for postings in posting_iters {
            queue.push(postings);
        }
        let is_done = queue.is_empty();
        let query_len = query_len as f64;
        Disjunction { index, query_len, scorer, queue, is_done }
    }

    /// Create an empty disjunction that never matches anything.
    fn empty(index: &'i IndexReader, scorer: NameScorer) -> Disjunction<'i> {
        Disjunction {
            index,
            query_len: 0.0,
            scorer,
            queue: BinaryHeap::new(),
            is_done: true,
        }
    }

    /// Skip this disjunction such that all posting iterators are
    /// positioned at the smallest doc ID greater than the given doc ID.
    ///
    /// If any posting iterator contains the given doc ID, then it is scored
    /// and returned. The score incorporates all posting iterators that
    /// contain the given doc ID.
    fn skip_to(&mut self, target_docid: DocID) -> Option<Scored<DocID>> {
        if self.is_done {
            return None;
        }
        let mut found = false;
        // loop invariant: loop until all posting iterators are either
        // positioned directly at the target doc ID (in which case, `found`
        // is set to `true`) or beyond the target doc ID. If none of the
        // iterators contain the target doc ID, then `found` remains
        // `false`.
        loop {
            // This unwrap is OK because we're only here if we have a
            // non-empty queue.
            let mut postings = self.queue.peek_mut().unwrap();
            if postings.docid().map_or(true, |x| x >= target_docid) {
                found = found || Some(target_docid) == postings.docid();
                // This is the smallest posting iterator, which means all
                // iterators are now either at or beyond target_docid.
                break;
            }
            // Skip through this iterator until we're at or beyond the
            // target doc ID.
            while postings.docid().map_or(false, |x| x < target_docid) {
                postings.next();
            }
            found = found || Some(target_docid) == postings.docid();
        }
        if !found {
            return None;
        }
        // We're here if we found our target doc ID, which means at least
        // one posting iterator is pointing to the doc ID and it is
        // necessarily the minimum doc ID of all the posting iterators in
        // this disjunction. Therefore, advance such that all posting
        // iterators are beyond the target doc ID.
        //
        // (If we didn't find the target doc ID, then the loop invariant
        // above guarantees that we are already passed the target doc ID.)
        self.next()
    }
}

impl<'i> Iterator for Disjunction<'i> {
    type Item = Scored<DocID>;

    fn next(&mut self) -> Option<Scored<DocID>> {
        if self.is_done {
            return None;
        }
        // Find our next matching ngram.
        let mut scored1 = {
            // This unwrap is OK because we're only here if we have a
            // non-empty queue.
            let mut postings = self.queue.peek_mut().unwrap();
            match postings.score() {
                None => {
                    self.is_done = true;
                    return None;
                }
                Some(scored) => {
                    postings.next();
                    scored
                }
            }
        };
        // Discover if any of the other posting iterators also match this
        // ngram.
        loop {
            // This unwrap is OK because we're only here if we have a
            // non-empty queue.
            let mut postings = self.queue.peek_mut().unwrap();
            match postings.score() {
                None => break,
                Some(scored2) => {
                    // If the smallest posting iterator isn't equivalent to
                    // the doc ID found above, then we've found all of the
                    // matching terms for this doc ID that we'll find.
                    if scored1.value() != scored2.value() {
                        break;
                    }
                    scored1 = scored1.map_score(|s| s + scored2.score());
                    postings.next();
                }
            }
        }
        // Some of our scorers are more convenient to compute at the
        // disjunction level rather than at the term level.
        if let NameScorer::Jaccard = self.scorer {
            // When using Jaccard, the score returned by the posting
            // iterator is always 1. Thus, `scored.score` represents the
            // total number of terms that matched this document. In other
            // words, it is the cardinality of the intersection of terms
            // between the query and our candidate, `|A ∩ B|`.
            //
            // `query_len` represents the total number of terms in our query
            // (not just the number of terms in this disjunction!), and
            // `doc_len` represents the total number of terms in our
            // candidate. Thus, since `|A u B| = |A| + |B| - |A ∩ B|`, we
            // have that `|A u B| = query_len + doc_len - scored.score`. And
            // finally, the Jaccard index is `|A ∩ B| / |A u B|`.
            let doc_len =
                self.index.document_length(*scored1.value()) as f64;
            let union = self.query_len + doc_len - scored1.score();
            scored1 = scored1.map_score(|s| s / union);
        } else if let NameScorer::QueryRatio = self.scorer {
            // This is like Jaccard, but our score is computed purely as the
            // ratio of query terms that matched this document.
            scored1 = scored1.map_score(|s| s / self.query_len)
        }
        Some(scored1)
    }
}

/// An iterator over a postings list for a specific ngram.
///
/// A postings list is a sequence of pairs, where each pair has a document
/// ID and a frequency. The document ID indicates that the ngram is in the
/// text indexed for that ID, and the frequency counts the number of times
/// that ngram occurs in the document.
///
/// To save space, each pair is encoded using 32 bits. Frequencies are
/// capped at a maximum of 15, which fit into the high 4 bits. The low 28
/// bits contain the doc ID.
///
/// The postings list starts with a single 32-bit little endian
/// integer that represents the document frequency of the ngram.
This in turn /// determines how many pairs to read. In other words, a posting list is a /// length prefixed array of 32 bit little endian integer values. /// /// This type is intended to be used in a max-heap, and orients its Ord /// definition such that the heap becomes a min-heap. The ordering criteria /// is derived from only the docid. #[derive(Clone)] struct PostingIter<'i> { /// A handle to the underlying index. index: &'i IndexReader, /// The scoring function to use. scorer: NameScorer, /// The number of times the term for these postings appeared in the /// original query. This increases the score proportionally. count: f64, /// The raw bytes of the posting list. The number of bytes is /// exactly equivalent to `4 * document-frequency(ngram)`, where /// `document-frequency(ngram)` is the total number of documents in which /// `ngram` occurs. /// /// This does not include the length prefix. postings: &'i [u8], /// The document frequency of this term. len: usize, /// The current posting. This is `None` once this iterator is exhausted. posting: Option, /// A docid used for sorting postings. When the iterator is exhausted, /// this is greater than the maximum doc id. Otherwise, this is always /// equivalent to posting.docid. /// /// We do this for efficiency by avoiding going through the optional /// Posting. docid: DocID, /// The OkapiBM25 IDF score. This is invariant across all items in a /// posting list, so we compute it once at construction. This saves a /// call to `log` for every doc ID visited. okapi_idf: f64, } /// A single entry in a posting list. #[derive(Clone, Copy, Debug)] struct Posting { /// The document id. docid: DocID, /// The frequency, i.e., the number of times the ngram occurred in the /// document identified by the docid. frequency: u32, } impl Posting { /// Read the next posting pair (doc ID and frequency) from the given /// postings list. If the list is empty, then return `None`. 
fn read(slice: &[u8]) -> Option { if slice.is_empty() { None } else { let v = read_le_u32(slice); Some(Posting { docid: v & MAX_DOC_ID, frequency: v >> 28 }) } } } impl<'i> PostingIter<'i> { /// Create a new posting iterator for the given term in the given index. /// Scores will be computed with the given scoring function. /// /// `count` should be the number of times this term occurred in the /// original query string. fn new( index: &'i IndexReader, scorer: NameScorer, count: usize, term: &str, ) -> PostingIter<'i> { let mut postings = &*index.postings; let offset = match index.ngram.get(term.as_bytes()) { Some(offset) => offset as usize, None => { // If the term isn't in the index, then return an exhausted // iterator. return PostingIter { index, scorer, count: 0.0, postings: &[], len: 0, posting: None, docid: MAX_DOC_ID + 1, okapi_idf: 0.0, }; } }; postings = &postings[offset..]; let len = read_le_u32(postings) as usize; postings = &postings[4..]; let corpus_count = index.config.num_documents as f64; let df = len as f64; let okapi_idf = (1.0 + (corpus_count - df + 0.5) / (df + 0.5)).log2(); let mut it = PostingIter { index, scorer, count: count as f64, postings: &postings[..4 * len], len, posting: None, docid: 0, okapi_idf, }; // Advance to the first posting. it.next(); it } /// Return the current posting. If this iterator has been exhausted, then /// this returns `None`. fn posting(&self) -> Option { self.posting } /// Returns the document frequency for the term corresponding to these /// postings. fn len(&self) -> usize { self.len } /// Return the current document ID. If this iterator has been exhausted, /// then this returns `None`. fn docid(&self) -> Option { self.posting().map(|p| p.docid) } /// Return the score with the current document ID. If this iterator has /// been exhausted, then this returns `None`. 
fn score(&self) -> Option> { match self.scorer { NameScorer::OkapiBM25 => self.score_okapibm25(), NameScorer::TFIDF => self.score_tfidf(), NameScorer::Jaccard => self.score_jaccard(), NameScorer::QueryRatio => self.score_query_ratio(), } .map(|scored| scored.map_score(|s| s * self.count)) } /// Score the current doc ID using Okapi BM25. It's similarish to TF-IDF, /// but uses a document length normalization term. fn score_okapibm25(&self) -> Option> { let post = match self.posting() { None => return None, Some(post) => post, }; let k1 = 1.2; let b = 0.75; let doc_len = self.index.document_length(post.docid); let norm = (doc_len as f64) / self.index.config.avg_document_len; let tf = post.frequency as f64; let num = tf * (k1 + 1.0); let den = tf + k1 * (1.0 - b + b * norm); let score = (num / den) * self.okapi_idf; let capped = if score < 0.0 { 0.0 } else { score }; Some(Scored::new(post.docid).with_score(capped)) } /// Score the current doc ID using the traditional TF-IDF ranking function. fn score_tfidf(&self) -> Option> { let post = match self.posting() { None => return None, Some(post) => post, }; let corpus_docs = self.index.config.num_documents as f64; let term_docs = self.len as f64; let tf = post.frequency as f64; let idf = (corpus_docs / (1.0 + term_docs)).log2(); let score = tf * idf; Some(Scored::new(post.docid).with_score(score)) } /// Score the current doc ID using the Jaccard index, which measures the /// overlap between two sets. /// /// Note that this always returns `1.0`. The Jaccard index itself must be /// computed by the disjunction scorer. fn score_jaccard(&self) -> Option> { self.posting().map(|p| Scored::new(p.docid).with_score(1.0)) } /// Score the current doc ID using the ratio of terms in the query that /// matched the terms in this doc ID. /// /// Note that this always returns `1.0`. The query ratio itself must be /// computed by the disjunction scorer. 
fn score_query_ratio(&self) -> Option> { self.posting().map(|p| Scored::new(p.docid).with_score(1.0)) } } impl<'i> Iterator for PostingIter<'i> { type Item = Posting; fn next(&mut self) -> Option { self.posting = match Posting::read(self.postings) { None => { self.docid = MAX_DOC_ID + 1; None } Some(p) => { self.postings = &self.postings[4..]; self.docid = p.docid; Some(p) } }; self.posting } } impl<'i> Eq for PostingIter<'i> {} impl<'i> PartialEq for PostingIter<'i> { fn eq(&self, other: &PostingIter<'i>) -> bool { self.docid == other.docid } } impl<'i> Ord for PostingIter<'i> { fn cmp(&self, other: &PostingIter<'i>) -> cmp::Ordering { // std::collections::BinaryHeap is a max-heap and we need a // min-heap, so write this as-if it were a max-heap, then reverse it. // Note that exhausted searchers should always have the lowest // priority, and therefore, be considered maximal. self.docid.cmp(&other.docid).reverse() } } impl<'i> PartialOrd for PostingIter<'i> { fn partial_cmp(&self, other: &PostingIter<'i>) -> Option { Some(self.cmp(other)) } } /// A writer for indexing names to disk. /// /// A writer opens and writes to several files simultaneously, which keeps the /// implementation simple. /// /// The index writer cannot stream the postings or term index, since the term /// index requires its ngrams to be inserted in sorted order. Postings lists /// are written as length prefixed sequences, so we need to know the lengths /// of all our postings lists before writing them. pub struct IndexWriter { /// A builder for the ngram term index. /// /// This isn't used until the caller indicates that it is done indexing /// names. At which point, we insert all ngrams into the FST in sorted /// order. Each ngram is mapped to the beginning of its correspond /// postings list. ngram: fst::MapBuilder>, /// The type of ngram extraction to use. ngram_type: NgramType, /// The size of ngrams to generate. ngram_size: usize, /// A writer for postings lists. 
/// /// This isn't written to until the caller indicates that it is done /// indexing names. At which point, every posting list is written as a /// length prefixed array, in the same order that terms are written to the /// term index. postings: CursorWriter>, /// A map from document ID to name ID. This is written to in a streaming /// fashion during indexing. The ID map consists of N 64-bit little /// endian integers, where N is the total number of names indexed. /// /// The document ID (the position in this map) is a unique internal /// identifier assigned to each name, while the name ID is an identifier /// provided by the caller. Multiple document IDs may map to the same /// name ID (e.g., for indexing alternate names). idmap: CursorWriter>, /// A map from document ID to document length, where the length corresponds /// to the number of ngrams in the document. The map consists of N 16-bit /// little endian integers, where N is the total number of names indexed. /// /// The document lengths are used at query time as normalization /// parameters. They are written in a streaming fashion during the indexing /// process. norms: CursorWriter>, /// A JSON formatted configuration file that includes some aggregate /// statistics (such as the average document length, in ngrams) and the /// ngram configuration. The ngram configuration in particular is used at /// query time to make sure that query-time uses the same analysis as /// index-time. /// /// This is written at the end of the indexing process. config: CursorWriter>, /// An in-memory map from ngram to its corresponding postings list. Once /// indexing is done, this is written to disk via the FST term index and /// postings list writers documented above. terms: FnvHashMap, /// The next document ID, starting at 0. Each name added gets assigned its /// own unique document ID. Queries read document IDs from the postings /// list, but are mapped back to name IDs using the `idmap` before being /// returned to the caller. 
next_docid: DocID, /// The average document length, in ngrams, for every name indexed. This is /// used along with document lengths to compute normalization terms for /// scoring at query time. avg_document_len: f64, } /// A single postings list. #[derive(Clone, Debug, Default)] struct Postings { /// A sorted list of postings, in order of ascending document IDs. list: Vec, } impl IndexWriter { /// Open an index for writing to the given directory. Any previous name /// index in the given directory is overwritten. /// /// The given ngram configuration is used to transform all indexed names /// into terms for the inverted index. pub fn open>( dir: P, ngram_type: NgramType, ngram_size: usize, ) -> Result { let dir = dir.as_ref(); let ngram = fst_map_builder_file(dir.join(NGRAM))?; let postings = CursorWriter::from_path(dir.join(POSTINGS))?; let idmap = CursorWriter::from_path(dir.join(IDMAP))?; let norms = CursorWriter::from_path(dir.join(NORMS))?; let config = CursorWriter::from_path(dir.join(CONFIG))?; Ok(IndexWriter { ngram, ngram_type, ngram_size, postings, idmap, norms, config, terms: FnvHashMap::default(), next_docid: 0, avg_document_len: 0.0, }) } /// Finish writing names and serialize the index to disk. pub fn finish(mut self) -> Result<()> { let num_docs = self.num_docs(); let mut ngram_to_postings: Vec<(String, Postings)> = self.terms.into_iter().collect(); // We could use a BTreeMap and get out our keys in sorted order, but // the overhead of inserting into the BTreeMap dwarfs the savings we // get from pre-sorted keys. 
        // The FST builder requires keys in lexicographic order.
        ngram_to_postings.sort_by(|&(ref t1, _), &(ref t2, _)| t1.cmp(t2));
        for (term, postings) in ngram_to_postings {
            // The term index maps each ngram to the byte offset at which its
            // postings list begins.
            let pos = self.postings.position() as u64;
            self.ngram.insert(term.as_bytes(), pos).map_err(Error::fst)?;
            // Length prefix: the number of postings that follow.
            self.postings
                .write_u32(postings.list.len() as u32)
                .map_err(Error::io)?;
            for posting in postings.list {
                // Pack frequency (capped at 15) into the high 4 bits and the
                // doc ID into the low 28 bits.
                let freq = cmp::min(15, posting.frequency);
                let v = (freq << 28) | posting.docid;
                self.postings.write_u32(v).map_err(Error::io)?;
            }
        }
        // Persist the ngram configuration and aggregate stats so query time
        // analysis matches index time analysis.
        serde_json::to_writer_pretty(
            &mut self.config,
            &Config {
                ngram_type: self.ngram_type,
                ngram_size: self.ngram_size,
                avg_document_len: self.avg_document_len,
                num_documents: num_docs as u64,
            },
        )
        .map_err(|e| Error::config(e.to_string()))?;
        self.ngram.finish().map_err(Error::fst)?;
        self.idmap.flush().map_err(Error::io)?;
        self.postings.flush().map_err(Error::io)?;
        self.norms.flush().map_err(Error::io)?;
        self.config.flush().map_err(Error::io)?;
        Ok(())
    }

    /// Inserts the given name to this index, and associates it with the
    /// provided `NameID`. Multiple names may be associated with the same
    /// `NameID`.
    pub fn insert(&mut self, name_id: NameID, name: &str) -> Result<()> {
        let docid = self.next_docid(name_id)?;
        let name = normalize_query(name);
        let mut count = 0u16; // document length in number of ngrams
        self.ngram_type.clone().iter(self.ngram_size, &name, |ngram| {
            self.insert_term(docid, ngram);
            // If a document length exceeds 2^16, then it is far too long for
            // a name anyway, so we cap it at 2^16.
            count = count.saturating_add(1);
        });
        // Update our mean document length (in ngrams).
        self.avg_document_len +=
            (count as f64 - self.avg_document_len) / (self.num_docs() as f64);
        // Write the document length to disk, which is used as a normalization
        // term for some scorers (like Okapi-BM25).
        self.norms.write_u16(count).map_err(Error::io)?;
        Ok(())
    }

    /// Add a single term that is part of a name identified by the given docid.
/// This updates the postings for this term, or creates a new posting if /// this is the first time this term has been seen. fn insert_term(&mut self, docid: DocID, term: &str) { if let Some(posts) = self.terms.get_mut(term) { posts.posting(docid).frequency += 1; return; } let mut list = Postings::default(); list.posting(docid).frequency = 1; self.terms.insert(term.to_string(), list); } /// Retrieve a fresh doc id, and associate it with the given name id. fn next_docid(&mut self, name_id: NameID) -> Result { let docid = self.next_docid; self.idmap.write_u64(name_id).map_err(Error::io)?; self.next_docid = match self.next_docid.checked_add(1) { None => bug!("exhausted doc ids"), Some(next_docid) => next_docid, }; if self.next_docid > MAX_DOC_ID { let max = MAX_DOC_ID + 1; // docids are 0-indexed bug!("exceeded maximum number of names ({})", max); } Ok(docid) } /// Return the total number of documents have been assigned doc ids. fn num_docs(&self) -> u32 { self.next_docid } } impl Postings { /// Return a mutable reference to the posting for the given docid. If one /// doesn't exist, then create one (with a zero frequency) and return it. fn posting(&mut self, docid: DocID) -> &mut Posting { if self.list.last().map_or(true, |x| x.docid != docid) { self.list.push(Posting { docid, frequency: 0 }); } // This unwrap is OK because if the list was empty when this method was // called, then we added an element above, and is thus now non-empty. self.list.last_mut().unwrap() } } /// The type of scorer that the name index should use. /// /// The default is OkapiBM25. If you aren't sure which scorer to use, then /// stick with the default. #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] pub enum NameScorer { /// OkapiBM25 is a TF-IDF-like ranking function, which takes name length /// into account. OkapiBM25, /// TFIDF is the traditional TF-IDF ranking function, which does not /// incorporate document length. 
TFIDF, /// Jaccard is a ranking function determined by computing the similarity /// of ngrams between the query and a name in the index. The similarity /// is computed by dividing the number of ngrams in common by the total /// number of distinct ngrams in both the query and the name combined. Jaccard, /// QueryRatio is a ranking function that represents the ratio of query /// terms that matched a name. It is computed by dividing the number of /// ngrams in common by the total number of ngrams in the query only. QueryRatio, } impl NameScorer { /// Returns a list of strings representing the possible scorer values. pub fn possible_names() -> &'static [&'static str] { &["okapibm25", "tfidf", "jaccard", "queryratio"] } /// Return a string representation of this scorer. /// /// The string returned can be parsed back into a `NameScorer`. pub fn as_str(&self) -> &'static str { match *self { NameScorer::OkapiBM25 => "okapibm25", NameScorer::TFIDF => "tfidf", NameScorer::Jaccard => "jaccard", NameScorer::QueryRatio => "queryratio", } } } impl Default for NameScorer { fn default() -> NameScorer { NameScorer::OkapiBM25 } } impl fmt::Display for NameScorer { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{}", self.as_str()) } } impl FromStr for NameScorer { type Err = Error; fn from_str(s: &str) -> Result { match s { "okapibm25" => Ok(NameScorer::OkapiBM25), "tfidf" => Ok(NameScorer::TFIDF), "jaccard" => Ok(NameScorer::Jaccard), "queryratio" => Ok(NameScorer::QueryRatio), unk => Err(Error::unknown_scorer(unk)), } } } /// The style of ngram extraction to use. /// /// The same style of ngram extraction is always used at index time and at /// query time. /// /// Each ngram type uses the ngram size configuration differently. /// /// All ngram styles used Unicode codepoints as the definition of a character. /// For example, a 3-gram might contain up to 4 bytes, if it contains 3 Unicode /// codepoints that each require 4 UTF-8 code units. 
#[derive(Clone, Copy, Debug, Deserialize, Eq, Hash, PartialEq, Serialize)]
pub enum NgramType {
    /// A windowing ngram.
    ///
    /// This is the traditional style of ngram, where a sliding window of size
    /// `N` is moved across the entire content to be indexed. For example, the
    /// 3-grams for the string `homer` are hom, ome and mer.
    #[serde(rename = "window")]
    Window,
    /// An edge ngram.
    ///
    /// This style of ngram produces ever longer ngrams, where each ngram is
    /// anchored to the start of a word. Words are determined simply by
    /// splitting whitespace.
    ///
    /// For example, the edge ngrams of `homer simpson`, where the max ngram
    /// size is 5, would be: hom, home, homer, sim, simp, simps. Generally,
    /// for this ngram type, one wants to use a large maximum ngram size.
    /// Perhaps somewhere close to the maximum number of ngrams in any word
    /// in the corpus.
    ///
    /// Note that there is no way to set the minimum ngram size (which is 3).
    #[serde(rename = "edge")]
    Edge,
}

/// The minimum size of an ngram emitted by the edge ngram iterator.
const MIN_EDGE_NGRAM_SIZE: usize = 3;

impl NgramType {
    /// Return all possible ngram types.
    pub fn possible_names() -> &'static [&'static str] {
        &["window", "edge"]
    }

    /// Return a string representation of this type.
    pub fn as_str(&self) -> &'static str {
        match *self {
            NgramType::Window => "window",
            NgramType::Edge => "edge",
        }
    }

    /// Execute the given function over each ngram in the text provided using
    /// the given size configuration.
    ///
    /// We don't use normal Rust iterators here because an internal iterator
    /// is much easier to implement.
fn iter<'t, F: FnMut(&'t str)>(&self, size: usize, text: &'t str, f: F) { match *self { NgramType::Window => NgramType::iter_window(size, text, f), NgramType::Edge => NgramType::iter_edge(size, text, f), } } fn iter_window<'t, F: FnMut(&'t str)>( size: usize, text: &'t str, mut f: F, ) { if size == 0 { return; } let end_skip = text.chars().take(size).count().saturating_sub(1); let start = text.char_indices(); let end = text.char_indices().skip(end_skip); for ((s, _), (e, c)) in start.zip(end) { f(&text[s..e + c.len_utf8()]); } } fn iter_edge<'t, F: FnMut(&'t str)>( max_size: usize, text: &'t str, mut f: F, ) { if max_size == 0 { return; } for word in text.split_whitespace() { let end_skip = word .chars() .take(MIN_EDGE_NGRAM_SIZE) .count() .saturating_sub(1); let mut size = end_skip + 1; for (end, c) in word.char_indices().skip(end_skip) { f(&word[..end + c.len_utf8()]); size += 1; if size > max_size { break; } } } } } impl Default for NgramType { fn default() -> NgramType { NgramType::Window } } impl fmt::Display for NgramType { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{}", self.as_str()) } } impl FromStr for NgramType { type Err = Error; fn from_str(s: &str) -> Result { match s { "window" => Ok(NgramType::Window), "edge" => Ok(NgramType::Edge), unk => Err(Error::unknown_ngram_type(unk)), } } } fn normalize_query(s: &str) -> String { // We might consider doing Unicode normalization here, but it probably // doesn't matter too much on a predominantly ASCII data set. s.to_lowercase() } fn read_le_u32(slice: &[u8]) -> u32 { u32::from_le_bytes(slice[..4].try_into().unwrap()) } #[cfg(test)] mod tests { use super::*; use crate::index::tests::TestContext; // Test the actual name index. /// Creates a name index, where each name provided is assigned its own /// unique ID, starting at 0. 
fn create_index(index_dir: &Path, names: &[&str]) -> IndexReader { let mut wtr = IndexWriter::open(index_dir, NgramType::Window, 3).unwrap(); for (i, name) in names.iter().enumerate() { wtr.insert(i as u64, name).unwrap(); } wtr.finish().unwrap(); IndexReader::open(index_dir).unwrap() } /// Build a name query, and disable the dynamic stop word detection. /// /// It would be nice to test the stop word detection, but it makes writing /// unit tests very difficult unfortunately. fn name_query(name: &str) -> NameQuery { NameQuery::new(name).with_stop_word_ratio(0.0) } fn ids(results: &[Scored]) -> Vec { let mut ids: Vec<_> = results.iter().map(|r| *r.value()).collect(); ids.sort(); ids } /// Some names involving bruce. const BRUCES: &'static [&'static str] = &[ "Bruce Springsteen", // 0 "Bruce Kulick", // 1 "Bruce Arians", // 2 "Bruce Smith", // 3 "Bruce Willis", // 4 "Bruce Wayne", // 5 "Bruce Banner", // 6 ]; #[test] fn names_bruces_1() { let ctx = TestContext::new("small"); let idx = create_index(ctx.index_dir(), BRUCES); let query = name_query("bruce"); let results = idx.search(&query).into_vec(); // This query matches everything. assert_eq!(results.len(), 7); // The top two hits are the shortest documents, because of Okapi-BM25's // length normalization. assert_eq!(results[0].score(), 1.0); assert_eq!(results[1].score(), 1.0); assert_eq!(ids(&results[0..2]), vec![3, 5]); } #[test] fn names_bruces_2() { let ctx = TestContext::new("small"); let idx = create_index(ctx.index_dir(), BRUCES); let query = name_query("e w"); let results = idx.search(&query).into_vec(); // The 'e w' ngram is only in two documents: Bruce Willis and // Bruce Wayne. Since Wayne is shorter than Willis, it should always // be first. 
        assert_eq!(results.len(), 2);
        assert_eq!(*results[0].value(), 5);
        assert_eq!(*results[1].value(), 4);
    }

    #[test]
    fn names_bruces_3() {
        let ctx = TestContext::new("small");
        let idx = create_index(ctx.index_dir(), BRUCES);

        // "Springsteen" only matches a single name.
        let query = name_query("Springsteen");
        let results = idx.search(&query).into_vec();
        assert_eq!(results.len(), 1);
        assert_eq!(*results[0].value(), 0);
    }

    #[test]
    fn names_bruces_4() {
        let ctx = TestContext::new("small");
        let idx = create_index(ctx.index_dir(), BRUCES);

        let query =
            name_query("Springsteen Kulick Arians Smith Willis Wayne Banner");
        let results = idx.search(&query).into_vec();
        // This query should hit everything.
        assert_eq!(results.len(), 7);
    }

    // Test our various ngram strategies.

    /// Collect the window ngrams of `text` into a vector.
    fn ngrams_window(n: usize, text: &str) -> Vec<&str> {
        let mut grams = vec![];
        NgramType::Window.iter(n, text, |gram| grams.push(gram));
        grams
    }

    /// Collect the edge ngrams of `text` into a vector.
    fn ngrams_edge(n: usize, text: &str) -> Vec<&str> {
        let mut grams = vec![];
        NgramType::Edge.iter(n, text, |gram| grams.push(gram));
        grams
    }

    #[test]
    #[should_panic]
    fn ngrams_window_zero_banned() {
        // A window size of 0 yields no ngrams at all, so this assertion
        // fails, which is what satisfies `should_panic`.
        assert_eq!(ngrams_window(0, "abc"), vec!["abc"]);
    }

    #[test]
    fn ngrams_window_weird_sizes() {
        assert_eq!(
            ngrams_window(2, "abcdef"),
            vec!["ab", "bc", "cd", "de", "ef",]
        );
        assert_eq!(
            ngrams_window(1, "abcdef"),
            vec!["a", "b", "c", "d", "e", "f",]
        );
        assert_eq!(ngrams_window(2, "ab"), vec!["ab",]);
        assert_eq!(ngrams_window(1, "ab"), vec!["a", "b",]);
        assert_eq!(ngrams_window(1, "a"), vec!["a",]);
        assert_eq!(ngrams_window(1, ""), Vec::<&str>::new());
    }

    #[test]
    fn ngrams_window_ascii() {
        // Text shorter than the window size yields a single short ngram.
        assert_eq!(
            ngrams_window(3, "abcdef"),
            vec!["abc", "bcd", "cde", "def",]
        );
        assert_eq!(ngrams_window(3, "abcde"), vec!["abc", "bcd", "cde",]);
        assert_eq!(ngrams_window(3, "abcd"), vec!["abc", "bcd",]);
        assert_eq!(ngrams_window(3, "abc"), vec!["abc",]);
        assert_eq!(ngrams_window(3, "ab"), vec!["ab",]);
        assert_eq!(ngrams_window(3, "a"), vec!["a",]);
        assert_eq!(ngrams_window(3, ""), Vec::<&str>::new());
    }

    #[test]
    fn ngrams_window_non_ascii() {
assert_eq!( ngrams_window(3, "αβγφδε"), vec!["αβγ", "βγφ", "γφδ", "φδε",] ); assert_eq!(ngrams_window(3, "αβγφδ"), vec!["αβγ", "βγφ", "γφδ",]); assert_eq!(ngrams_window(3, "αβγφ"), vec!["αβγ", "βγφ",]); assert_eq!(ngrams_window(3, "αβγ"), vec!["αβγ",]); assert_eq!(ngrams_window(3, "αβ"), vec!["αβ",]); assert_eq!(ngrams_window(3, "α"), vec!["α",]); } #[test] fn ngrams_edge_ascii() { assert_eq!( ngrams_edge(5, "homer simpson"), vec!["hom", "home", "homer", "sim", "simp", "simps",] ); assert_eq!(ngrams_edge(5, "h"), vec!["h",]); assert_eq!(ngrams_edge(5, "ho"), vec!["ho",]); assert_eq!(ngrams_edge(5, "hom"), vec!["hom",]); assert_eq!(ngrams_edge(5, "home"), vec!["hom", "home",]); } #[test] fn ngrams_edge_non_ascii() { assert_eq!( ngrams_edge(5, "δεαβγφδε δε"), vec!["δεα", "δεαβ", "δεαβγ", "δε",] ); } } ================================================ FILE: imdb-index/src/index/rating.rs ================================================ use std::path::Path; use fst::{IntoStreamer, Streamer}; use memmap::Mmap; use crate::error::{Error, Result}; use crate::record::Rating; use crate::util::{ csv_file, fst_set_builder_file, fst_set_file, IMDB_RATINGS, }; /// The name of the ratings index file. /// /// The ratings index maps IMDb title ID to their average rating and number of /// votes. The index is itself an FST set, where all keys begin with the IMDb /// title ID, and also contain the average rating and number votes. Thus, a /// lookup is accomplished via a range query on the title ID without needing /// to consult the original CSV data. const RATINGS: &str = "ratings.fst"; /// An index for ratings, which supports looking up ratings/votes for IMDb /// titles efficiently. #[derive(Debug)] pub struct Index { idx: fst::Set, } impl Index { /// Open a rating index from the given index directory. pub fn open>(index_dir: P) -> Result { Ok(Index { // We claim it is safe to open the following memory map because we // don't mutate them and no other process (should) either. 
idx: unsafe { fst_set_file(index_dir.as_ref().join(RATINGS))? }, }) } /// Create a rating index from the given IMDb data directory, and write it /// to the given index directory. If a rating index already exists, then it /// is overwritten. pub fn create, P2: AsRef>( data_dir: P1, index_dir: P2, ) -> Result { let data_dir = data_dir.as_ref(); let index_dir = index_dir.as_ref(); let mut buf = vec![]; let mut count = 0u64; let mut idx = fst_set_builder_file(index_dir.join(RATINGS))?; let mut rdr = csv_file(data_dir.join(IMDB_RATINGS))?; for result in rdr.deserialize() { let record: Rating = result.map_err(Error::csv)?; buf.clear(); write_rating(&record, &mut buf)?; idx.insert(&buf).map_err(Error::fst)?; count += 1; } idx.finish().map_err(Error::fst)?; log::info!("{} ratings indexed", count); Index::open(index_dir) } /// Return the rating information (which includes the actual rating and /// the number of votes associated with that rating) for the given IMDb /// identifier. If no rating information exists for the given ID, then /// `None` is returned. 
pub fn rating(&self, id: &[u8]) -> Result> { let mut upper = id.to_vec(); upper.push(0xFF); let mut stream = self.idx.range().ge(id).le(upper).into_stream(); while let Some(rating_bytes) = stream.next() { return Ok(Some(read_rating(rating_bytes)?)); } Ok(None) } } fn read_rating(bytes: &[u8]) -> Result { let nul = match bytes.iter().position(|&b| b == 0) { Some(nul) => nul, None => bug!("could not find nul byte"), }; let id = match String::from_utf8(bytes[..nul].to_vec()) { Err(err) => bug!("rating id invalid UTF-8: {}", err), Ok(tvshow_id) => tvshow_id, }; let i = nul + 1; Ok(Rating { id, rating: read_rating_value(&bytes[i..])?, votes: read_votes_value(&bytes[i + 4..])?, }) } fn write_rating(rat: &Rating, buf: &mut Vec) -> Result<()> { if rat.id.as_bytes().iter().any(|&b| b == 0) { bug!("unsupported rating id (with NUL byte) for {:?}", rat); } buf.extend_from_slice(rat.id.as_bytes()); buf.push(0x00); write_rating_value(rat.rating, buf); write_votes_value(rat.votes, buf); Ok(()) } fn read_votes_value(slice: &[u8]) -> Result { if slice.len() < 4 { bug!("not enough bytes to read votes value"); } Ok(u32::from_be_bytes(slice[..4].try_into().unwrap())) } fn write_votes_value(votes: u32, buf: &mut Vec) { buf.extend_from_slice(&votes.to_be_bytes()) } fn read_rating_value(slice: &[u8]) -> Result { if slice.len() < 4 { bug!("not enough bytes to read rating value"); } Ok(f32::from_be_bytes(slice[..4].try_into().unwrap())) } fn write_rating_value(rating: f32, buf: &mut Vec) { buf.extend_from_slice(&rating.to_be_bytes()) } #[cfg(test)] mod tests { use super::Index; use crate::index::tests::TestContext; #[test] fn basics() { let ctx = TestContext::new("small"); let idx = Index::create(ctx.data_dir(), ctx.index_dir()).unwrap(); let rat = idx.rating(b"tt0000001").unwrap().unwrap(); assert_eq!(rat.rating, 5.8); assert_eq!(rat.votes, 1356); assert!(idx.rating(b"tt9999999").unwrap().is_none()); } } ================================================ FILE: imdb-index/src/index/tests.rs 
================================================ use std::path::{Path, PathBuf}; /// Create an error from a format!-like syntax. #[macro_export] macro_rules! err { ($($tt:tt)*) => { Box::::from(format!($($tt)*)) } } /// A convenient result type alias. pub type Result = std::result::Result>; /// A simple test context that makes it convenient to create an index. /// /// Each test context has an IMDb data directory (which usually has only a /// subset of the actual data) and an index directory (which starts empty by /// default). #[derive(Debug)] pub struct TestContext { _tmpdir: TempDir, data_dir: PathBuf, index_dir: PathBuf, } impl TestContext { /// Create a new test context using the test data set name given. /// /// Test data sets can be found in the `data/test` directory in this /// repository's root. Data set names are the names of sub-directories of /// `data`. pub fn new(name: &str) -> TestContext { let tmpdir = TempDir::new("imdb-rename-test-index").unwrap(); let data_dir = PathBuf::from("../data/test").join(name); let index_dir = tmpdir.path().to_path_buf(); TestContext { _tmpdir: tmpdir, data_dir, index_dir } } /// Return the path to the data directory for this context. pub fn data_dir(&self) -> &Path { &self.data_dir } /// Return the path to the index directory for this context. pub fn index_dir(&self) -> &Path { &self.index_dir } } /// A simple wrapper for creating a temporary directory that is automatically /// deleted when it's dropped. /// /// We use this in lieu of tempfile because tempfile brings in too many /// dependencies. #[derive(Debug)] pub struct TempDir(PathBuf); impl Drop for TempDir { fn drop(&mut self) { std::fs::remove_dir_all(&self.0).unwrap(); } } impl TempDir { /// Create a new empty temporary directory under the system's configured /// temporary directory. 
pub fn new(prefix: &str) -> Result { use std::sync::atomic::{AtomicUsize, Ordering}; static TRIES: usize = 100; static COUNTER: AtomicUsize = AtomicUsize::new(0); let tmpdir = std::env::temp_dir(); for _ in 0..TRIES { let count = COUNTER.fetch_add(1, Ordering::SeqCst); let path = tmpdir.join(prefix).join(count.to_string()); if path.is_dir() { continue; } std::fs::create_dir_all(&path).map_err(|e| { err!("failed to create {}: {}", path.display(), e) })?; return Ok(TempDir(path)); } Err(err!("failed to create temp dir after {} tries", TRIES)) } /// Return the underlying path to this temporary directory. pub fn path(&self) -> &Path { &self.0 } } ================================================ FILE: imdb-index/src/index/writer.rs ================================================ use std::fs::File; use std::io::{self, Write}; use std::path::Path; use crate::error::Result; use crate::util::create_file; /// Wraps any writer and records the current position in the writer. /// /// The position recorded always corresponds to the position that the next /// byte would be written to. #[derive(Clone, Debug)] pub struct CursorWriter { wtr: W, pos: usize, } impl CursorWriter> { /// Create a new cursor writer that will write to a file at the given path. /// The file is truncated before writing. pub fn from_path>(path: P) -> Result { let file = create_file(path)?; Ok(CursorWriter::new(io::BufWriter::new(file))) } } impl CursorWriter { /// Wrap the given writer with a counter. pub fn new(wtr: W) -> CursorWriter { CursorWriter { wtr, pos: 0 } } /// Return the current position of this writer. pub fn position(&self) -> usize { self.pos } /// Write a u16LE. pub fn write_u16(&mut self, n: u16) -> io::Result<()> { self.write_all(&n.to_le_bytes()) } /// Write a u32LE. pub fn write_u32(&mut self, n: u32) -> io::Result<()> { self.write_all(&n.to_le_bytes()) } /// Write a u64LE. 
pub fn write_u64(&mut self, n: u64) -> io::Result<()> { self.write_all(&n.to_le_bytes()) } } impl io::Write for CursorWriter { fn write(&mut self, buf: &[u8]) -> io::Result { let n = self.wtr.write(buf)?; self.pos += n; Ok(n) } fn flush(&mut self) -> io::Result<()> { self.wtr.flush() } } ================================================ FILE: imdb-index/src/lib.rs ================================================ /*! This crate provides an on-disk indexing data structure for searching IMDb. Searching is primarily done using information retrieval techniques, which support fuzzy name queries and using TF-IDF-like ranking functions. */ #![deny(missing_docs)] pub use crate::error::{Error, ErrorKind, Result}; pub use crate::index::{ AKARecordIter, Index, IndexBuilder, MediaEntity, NameQuery, NameScorer, NgramType, }; pub use crate::record::{Episode, Rating, Title, TitleKind, AKA}; pub use crate::scored::{Scored, SearchResults}; pub use crate::search::{Query, Searcher, Similarity}; // A macro that creates an error that represents a bug. // // This is typically used when reading index structures from disk. Since the // data on disk is generally outside our control, we return an error using this // macro instead of panicking (or worse, silently misinterpreting data). macro_rules! bug { ($($tt:tt)*) => {{ return Err($crate::error::Error::bug(format!($($tt)*))); }} } mod error; mod index; mod record; mod scored; mod search; mod util; ================================================ FILE: imdb-index/src/record.rs ================================================ use std::cmp; use std::fmt; use std::str::FromStr; use serde::{Deserialize, Deserializer, Serialize}; use crate::error::Error; /// An IMDb title record. /// /// This is the primary type of an IMDb media entry. This record defines the /// identifier of an IMDb title, which serves as a foreign key in other data /// files (such as alternate names, episodes and ratings). 
#[derive(Clone, Debug, Deserialize)]
pub struct Title {
    /// An IMDb identifier.
    ///
    /// Generally, this is a fixed width string beginning with the characters
    /// `tt`.
    #[serde(rename = "tconst")]
    pub id: String,
    /// The specific type of a title, e.g., movie, TV show, episode, etc.
    #[serde(rename = "titleType")]
    pub kind: TitleKind,
    /// The primary name of this title.
    #[serde(rename = "primaryTitle")]
    pub title: String,
    /// The "original" name of this title.
    #[serde(rename = "originalTitle")]
    pub original_title: String,
    /// Whether this title is classified as "adult" material or not.
    #[serde(rename = "isAdult", deserialize_with = "number_as_bool")]
    pub is_adult: bool,
    /// The start year of this title.
    ///
    /// Generally, things like movies or TV episodes have a start year to
    /// indicate their release year and no end year. TV shows also have a
    /// start year. TV shows that are still airing lack an end time, but TV
    /// shows that have stopped will typically have an end year indicating
    /// when it stopped airing.
    ///
    /// Note that not all titles have a start year.
    #[serde(rename = "startYear", deserialize_with = "csv::invalid_option")]
    pub start_year: Option<u32>,
    /// The end year of this title.
    ///
    /// This is typically used to indicate the ending year of a TV show that
    /// has stopped production.
    #[serde(rename = "endYear", deserialize_with = "csv::invalid_option")]
    pub end_year: Option<u32>,
    /// The runtime, in minutes, of this title.
    #[serde(
        rename = "runtimeMinutes",
        deserialize_with = "csv::invalid_option"
    )]
    pub runtime_minutes: Option<u32>,
    /// A comma separated string of genres.
    #[serde(rename = "genres")]
    pub genres: String,
}

/// The kind of a title. These form a partitioning of all titles, where every
/// title has exactly one kind.
///
/// This type has a `FromStr` implementation that permits parsing a string
/// containing a title kind into this type. Note that parsing a title kind
/// recognizes all forms present in the IMDb data, and also additional common
/// sense forms. For example, `tvshow` and `tvSeries` are both accepted as
/// terms for the `TVSeries` variant.
#[derive(Clone, Copy, Debug, Deserialize, Eq, Hash, PartialEq, Serialize)]
#[allow(missing_docs)]
pub enum TitleKind {
    #[serde(rename = "movie")]
    Movie,
    #[serde(rename = "short")]
    Short,
    #[serde(rename = "tvEpisode")]
    TVEpisode,
    #[serde(rename = "tvMiniSeries")]
    TVMiniSeries,
    #[serde(rename = "tvMovie")]
    TVMovie,
    #[serde(rename = "tvSeries")]
    TVSeries,
    #[serde(rename = "tvShort")]
    TVShort,
    #[serde(rename = "tvSpecial")]
    TVSpecial,
    #[serde(rename = "video")]
    Video,
    #[serde(rename = "videoGame")]
    VideoGame,
}

impl TitleKind {
    /// Return a string representation of this title kind.
    ///
    /// This string representation is intended to be the same string
    /// representation used in the IMDb data files.
    pub fn as_str(&self) -> &'static str {
        use self::TitleKind::*;
        match *self {
            Movie => "movie",
            Short => "short",
            TVEpisode => "tvEpisode",
            TVMiniSeries => "tvMiniSeries",
            TVMovie => "tvMovie",
            TVSeries => "tvSeries",
            TVShort => "tvShort",
            TVSpecial => "tvSpecial",
            Video => "video",
            VideoGame => "videoGame",
        }
    }

    /// Returns true if and only if this kind represents a TV series.
pub fn is_tv_series(&self) -> bool { use self::TitleKind::*; match *self { TVMiniSeries | TVSeries => true, _ => false, } } } impl fmt::Display for TitleKind { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{}", self.as_str()) } } impl Ord for TitleKind { fn cmp(&self, other: &TitleKind) -> cmp::Ordering { self.as_str().cmp(other.as_str()) } } impl PartialOrd for TitleKind { fn partial_cmp(&self, other: &TitleKind) -> Option { Some(self.cmp(other)) } } impl FromStr for TitleKind { type Err = Error; fn from_str(ty: &str) -> Result { use self::TitleKind::*; match &*ty.to_lowercase() { "movie" => Ok(Movie), "short" => Ok(Short), "tvepisode" | "episode" => Ok(TVEpisode), "tvminiseries" | "miniseries" => Ok(TVMiniSeries), "tvmovie" => Ok(TVMovie), "tvseries" | "tvshow" | "show" => Ok(TVSeries), "tvshort" => Ok(TVShort), "tvspecial" | "special" => Ok(TVSpecial), "video" => Ok(Video), "videogame" | "game" => Ok(VideoGame), unk => Err(Error::unknown_title(unk)), } } } /// A single alternate name. /// /// Every title has one or more names, and zero or more alternate names. To /// represent multiple names, AKA or "also known as" records are provided. /// There may be many AKA records for a single title. #[derive(Clone, Debug, Deserialize)] pub struct AKA { /// The IMDb identifier that these AKA records describe. #[serde(rename = "titleId")] pub id: String, /// The order in which an AKA record should be preferred. #[serde(rename = "ordering")] pub order: i32, /// The alternate name. #[serde(rename = "title")] pub title: String, /// A geographic region in which this alternate name applies. #[serde(rename = "region")] pub region: String, /// The language of this alternate name. #[serde(rename = "language")] pub language: String, /// A comma separated list of types for this name. #[serde(rename = "types")] pub types: String, /// A comma separated list of attributes for this name. 
#[serde(rename = "attributes")] pub attributes: String, /// A flag indicating whether this corresponds to the original title or /// not. #[serde( rename = "isOriginalTitle", deserialize_with = "optional_number_as_bool" )] pub is_original_title: Option, } /// A single episode record. /// /// An episode record is an entry that joins two title records together, and /// provides episode specific information, such as the season and episode /// number. The two title records joined correspond to the title record for the /// TV show and the title record for the episode. #[derive(Clone, Debug, Deserialize)] pub struct Episode { /// The IMDb title identifier for this episode. #[serde(rename = "tconst")] pub id: String, /// The IMDb title identifier for the parent TV show of this episode. #[serde(rename = "parentTconst")] pub tvshow_id: String, /// The season in which this episode is contained, if it exists. #[serde( rename = "seasonNumber", deserialize_with = "csv::invalid_option" )] pub season: Option, /// The episode number of the season in which this episode is contained, if /// it exists. #[serde( rename = "episodeNumber", deserialize_with = "csv::invalid_option" )] pub episode: Option, } /// A rating associated with a single title record. #[derive(Clone, Debug, Deserialize)] pub struct Rating { /// The IMDb title identifier for this rating. #[serde(rename = "tconst")] pub id: String, /// The rating, on a scale of 0 to 10, for this title. #[serde(rename = "averageRating")] pub rating: f32, /// The number of votes involved in this rating. 
#[serde(rename = "numVotes")] pub votes: u32, } fn number_as_bool<'de, D>(de: D) -> Result where D: Deserializer<'de>, { i32::deserialize(de).map(|n| n != 0) } fn optional_number_as_bool<'de, D>(de: D) -> Result, D::Error> where D: Deserializer<'de>, { Ok(i32::deserialize(de).map(|n| Some(n != 0)).unwrap_or(None)) } ================================================ FILE: imdb-index/src/scored.rs ================================================ use std::cmp; use std::collections::BinaryHeap; use std::num::FpCategory; use std::vec; /// A collection of scored values, sorted in descending order by score. #[derive(Clone, Debug, Default)] pub struct SearchResults(Vec>); impl SearchResults { /// Create an empty collection of scored values. pub fn new() -> SearchResults { SearchResults(vec![]) } /// Create a collection of search results from a min-heap of scored values. pub fn from_min_heap( queue: &mut BinaryHeap>>, ) -> SearchResults { let mut results = vec![]; while let Some(x) = queue.pop() { results.push(x.0); } results.reverse(); SearchResults(results) } /// Add a new scored value to this collection. /// /// The score provided must be less than or equal to every other score in /// this collection, otherwise this method will panic. pub fn push(&mut self, scored: Scored) { assert!(self.0.last().map_or(true, |smallest| &scored <= smallest)); self.0.push(scored); } /// Normalizes the scores in this collection such that all scores are in /// the range `[0, 1]` where the top result always has score `1.0`. /// /// This operation is idempotent and does not change the ordering of /// results. pub fn normalize(&mut self) { if let Some(top_score) = self.0.get(0).map(|s| s.score()) { // The minimal score is 0, so if the top score is 0, then all // scores must be 0. No normalization needed. (And we avoid a // divide-by-zero below.) 
if top_score.classify() == FpCategory::Zero { return; } for result in &mut self.0 { let score = result.score(); result.set_score(score / top_score); } } } /// Recomputes the scores in this collection using the given function. /// /// The results are then re-sorted according to the new scores. pub fn rescore f64>(&mut self, mut rescore: F) { for result in &mut self.0 { let score = rescore(result.value()); result.set_score(score); } self.0.sort_by(|s1, s2| s1.cmp(&s2).reverse()); } /// Trim this collection so that it contains at most the first `size` /// results. pub fn trim(&mut self, size: usize) { if self.0.len() > size { self.0.drain(size..); } } /// Returns the number of results in this collection. pub fn len(&self) -> usize { self.0.len() } /// Returns true if and only if this collection is empty. pub fn is_empty(&self) -> bool { self.0.is_empty() } /// Return a slice of search results in order. pub fn as_slice(&self) -> &[Scored] { &self.0 } /// Consume this collection and return the underlying sorted sequence of /// scored values. pub fn into_vec(self) -> Vec> { self.0 } } impl IntoIterator for SearchResults { type IntoIter = vec::IntoIter>; type Item = Scored; fn into_iter(self) -> vec::IntoIter> { self.into_vec().into_iter() } } /// Any value associated with a score. /// /// We define Eq and Ord on this type in a way that ignores `value` and only /// uses the `score` to determine ordering. The public API of `Scored` /// guarantees that scores are never `NaN`. #[derive(Clone, Copy, Debug)] pub struct Scored { score: f64, value: T, } impl Scored { /// Create a new value `T` with a score of `1.0`. pub fn new(value: T) -> Scored { Scored { score: 1.0, value } } /// Return the score for this item. /// /// In general, no restrictions are placed on the range of scores, however /// most search APIs that use it will return scores in the range `[0, 1]`. /// /// The score returned is guaranteed to never be `NaN`. 
pub fn score(&self) -> f64 { self.score } /// Set the score, replacing the existing value with the given value. /// /// This panics if the given score is `NaN`. pub fn set_score(&mut self, score: f64) { assert!(score.is_finite()); self.score = score; } /// Consume this scored value and return a new scored value that drops the /// existing score and replaces it with the given score. /// /// This panics if the given score is `NaN`. pub fn with_score(mut self, score: f64) -> Scored { self.set_score(score); self } /// Consume this scored value and map its value using the function given, /// returning a new scored value with the result of the map and an /// unchanged score. pub fn map U>(self, f: F) -> Scored { Scored { score: self.score, value: f(self.value) } } /// Consume this scored value and map its score using the function given, /// return a new `Scored` with an unchanged value. /// /// This panics if score returned by `f` is `NaN`. pub fn map_score f64>(self, f: F) -> Scored { let score = f(self.score); self.with_score(score) } /// Return a reference to the underlying value. pub fn value(&self) -> &T { &self.value } /// Consume this scored value, drop the score and return the underlying /// `T`. pub fn into_value(self) -> T { self.value } /// Consume this scored value and return the underlying pair of score and /// `T`. 
pub fn into_pair(self) -> (f64, T) { (self.score, self.value) } } impl Default for Scored { fn default() -> Scored { Scored::new(T::default()) } } impl Eq for Scored {} impl PartialEq for Scored { fn eq(&self, other: &Scored) -> bool { let (s1, s2) = (self.score, other.score); s1 == s2 } } impl Ord for Scored { fn cmp(&self, other: &Scored) -> cmp::Ordering { self.score.partial_cmp(&other.score).unwrap() } } impl PartialOrd for Scored { fn partial_cmp(&self, other: &Scored) -> Option { Some(self.cmp(other)) } } #[cfg(test)] mod tests { use super::Scored; use std::f64::NAN; #[test] #[should_panic] fn never_nan_1() { Scored::new(()).set_score(NAN); } #[test] #[should_panic] fn never_nan_2() { Scored::new(()).with_score(NAN); } #[test] #[should_panic] fn never_nan_3() { Scored::new(()).map_score(|_| NAN); } } ================================================ FILE: imdb-index/src/search.rs ================================================ use std::cmp; use std::f64; use std::fmt; use std::result; use std::str::FromStr; use lazy_static::lazy_static; use regex::Regex; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use crate::error::{Error, Result}; use crate::index::{Index, MediaEntity, NameQuery, NameScorer}; use crate::record::{Episode, Rating, Title, TitleKind}; use crate::scored::{Scored, SearchResults}; use crate::util::{csv_file, IMDB_BASICS}; /// A handle that permits searching IMDb media records with relevance ranking. /// /// A searcher is constructed by providing it a handle to an IMDb /// [`Index`](struct.Index.html). The `Index` is responsible for managing the /// lower level data access, while the `Searcher` provides high level routines /// for ranking results. /// /// The primary interface to a `Searcher` is its `search` method, which takes /// as input a [`Query`](struct.Query.html) and returns a ranked list of /// [`MediaEntity`](struct.MediaEntity.html) as output. 
#[derive(Debug)] pub struct Searcher { idx: Index, } impl Searcher { /// Create a new searcher for the given `Index`. /// /// A single searcher can be used to execute many queries. /// /// An existing `Index` can be opened with `Index::open`, and a new `Index` /// can be created with `Index::create`. pub fn new(idx: Index) -> Searcher { Searcher { idx } } /// Execute a search with the given `Query`. /// /// Generally, the results returned are ranked in relevance order, where /// each result has a score associated with it. The score is between /// `0` and `1.0` (inclusive), where a score of `1.0` means "most similar" /// and a score of `0` means "least similar." /// /// Depending on the query, the behavior of search can vary: /// /// * When the query specifies a similarity function, then the results are /// ranked by that function. /// * When the query contains a name to search by and a name scorer, then /// results are ranked by the name scorer. If the query specifies a /// similarity function, then results are first ranked by the name /// scorer, and then re-ranked by the similarity function. /// * When no name or no name scorer are specified by the query, then /// this search will do a (slow) exhaustive search over all media records /// in IMDb. As a special case, if the query contains a TV show ID, then /// only records in that TV show are searched, and this is generally /// fast. /// * If the query is empty, then no results are returned. /// /// If there was a problem reading the underlying index or the IMDb data, /// then an error is returned. pub fn search( &mut self, query: &Query, ) -> Result> { if query.is_empty() { return Ok(SearchResults::new()); } let mut results = match query.name_query() { None => self.search_exhaustive(query)?, Some(nameq) => self.search_with_name(query, &nameq)?, }; results.trim(query.size); results.normalize(); Ok(results) } /// Return a mutable reference to the underlying index for this searcher. 
pub fn index(&mut self) -> &mut Index { &mut self.idx } fn search_with_name( &mut self, query: &Query, name_query: &NameQuery, ) -> Result> { let mut results = SearchResults::new(); for r in self.idx.search(name_query)? { if query.similarity.is_none() && results.len() >= query.size { break; } let (score, title) = r.into_pair(); let entity = self.idx.entity_from_title(title)?; if query.matches(&entity) { results.push(Scored::new(entity).with_score(score)); } } if !query.similarity.is_none() { results.rescore(|e| self.similarity(query, &e.title().title)); } Ok(results) } fn search_exhaustive( &mut self, query: &Query, ) -> Result> { if let Some(ref tvshow_id) = query.tvshow_id { return self.search_with_tvshow(query, tvshow_id); } let mut rdr = csv_file(self.idx.data_dir().join(IMDB_BASICS))?; if !query.has_filters() { let mut nresults = SearchResults::new(); let mut record = csv::StringRecord::new(); while rdr.read_record(&mut record).map_err(Error::csv)? { let id_title = (record[0].to_string(), record[2].to_string()); nresults.push(Scored::new(id_title)); } nresults.rescore(|t| self.similarity(query, &t.1)); let mut results = SearchResults::new(); for nresult in nresults.into_vec().into_iter().take(query.size) { let (score, (id, _)) = nresult.into_pair(); let entity = match self.idx.entity(&id)? 
{ None => continue, Some(entity) => entity, }; results.push(Scored::new(entity).with_score(score)); } Ok(results) } else if query.needs_only_title() { let mut tresults = SearchResults::new(); for result in rdr.deserialize() { let title: Title = result.map_err(Error::csv)?; if query.matches_title(&title) { tresults.push(Scored::new(title)); } } tresults.rescore(|t| self.similarity(query, &t.title)); let mut results = SearchResults::new(); for tresult in tresults.into_vec().into_iter().take(query.size) { let (score, title) = tresult.into_pair(); let entity = self.idx.entity_from_title(title)?; results.push(Scored::new(entity).with_score(score)); } Ok(results) } else { let mut results = SearchResults::new(); for result in rdr.deserialize() { let title = result.map_err(Error::csv)?; let entity = self.idx.entity_from_title(title)?; if query.matches(&entity) { results.push(Scored::new(entity)); } } results.rescore(|e| self.similarity(query, &e.title().title)); Ok(results) } } fn search_with_tvshow( &mut self, query: &Query, tvshow_id: &str, ) -> Result> { let mut results = SearchResults::new(); for ep in self.idx.seasons(tvshow_id)? { let entity = match self.idx.entity(&ep.id)? { None => continue, Some(entity) => entity, }; if query.matches(&entity) { results.push(Scored::new(entity)); } } if !query.similarity.is_none() { results.rescore(|e| self.similarity(query, &e.title().title)); } Ok(results) } fn similarity(&self, query: &Query, name: &str) -> f64 { match query.name { None => 0.0, Some(ref qname) => query.similarity.similarity(qname, name), } } } /// A query that can be used to search IMDb media records. /// /// A query typically consists of a fuzzy name query along with zero or more /// filters. If a query lacks a fuzzy name query, then this will generally /// result in an exhaustive search of all IMDb media records, which can be /// slow. /// /// Filters are matched conjunctively. That is, a search result must satisfy /// every filter on a query to match. 
/// /// Empty queries always return no results. /// /// The `Serialize` and `Deserialize` implementations for this type use the /// free-form query syntax. #[derive(Clone, Debug, Eq, Hash, PartialEq)] pub struct Query { name: Option, name_scorer: Option, similarity: Similarity, size: usize, kinds: Vec, year: Range, votes: Range, season: Range, episode: Range, tvshow_id: Option, } impl Default for Query { fn default() -> Query { Query::new() } } impl Query { /// Create a new empty query. pub fn new() -> Query { Query { name: None, name_scorer: Some(NameScorer::default()), similarity: Similarity::default(), size: 30, kinds: vec![], year: Range::none(), votes: Range::none(), season: Range::none(), episode: Range::none(), tvshow_id: None, } } /// Return true if and only if this query is empty. /// /// Searching with an empty query always yields no results. pub fn is_empty(&self) -> bool { self.name.as_ref().map_or(true, |n| n.is_empty()) && self.kinds.is_empty() && self.year.is_none() && self.votes.is_none() && self.season.is_none() && self.episode.is_none() && self.tvshow_id.is_none() } /// Set the name to query by. /// /// The name given here is normalized and broken down into components /// automatically to facilitate fuzzy searching. /// /// Note that if no name is provided in a query, then it is possible that /// searching with the query will require exhaustively looking at every /// record in IMDb. This will be slower. pub fn name(mut self, name: &str) -> Query { self.name = Some(name.to_string()); self } /// Set the scorer to use for name searches. /// /// The name scorer is used to rank results from searching the IMDb name /// index. If no name query is given, then this scorer is not used. /// /// If `None` is provided here, then the name index will not be used. This /// will likely cause an exhaustive search of all IMDb records, which can /// be slow. 
The use case for providing a name query without a name scorer /// is if you, for example, wanted to rank all of the records in IMDb /// by the Levenshtein distance between your query and every other record /// in IMDb. Normally, when the name index is used, only the (small number) /// of results returned by searching the name are ranked. Typically, these /// sorts of queries are useful for evaluation purposes, but not much else. pub fn name_scorer(mut self, scorer: Option) -> Query { self.name_scorer = scorer; self } /// Set the similarity function. /// /// The similarity function can be selected from a predefined set of /// choices defined by the /// [`Similarity`](enum.Similarity.html) type. /// /// When a similarity function is used, then any results from searching /// the name index are re-ranked according to their similarity with the /// query. /// /// By default, no similarity function is used. pub fn similarity(mut self, sim: Similarity) -> Query { self.similarity = sim; self } /// Set the maximum number of results to be returned by a search. /// /// Note that setting this number too high (e.g., `> 10,000`) can impact /// performance. This is a normal restriction found in most information /// retrieval systems. That is, deep paging through result sets is /// expensive. pub fn size(mut self, size: usize) -> Query { self.size = size; self } /// Add a title kind to filter by. /// /// Multiple title kinds can be added to query, and search results must /// match at least one of them. /// /// Note that it is not possible to remove title kinds from an existing /// query. Instead, build a new query from scratch. pub fn kind(mut self, kind: TitleKind) -> Query { if !self.kinds.contains(&kind) { self.kinds.push(kind); } self } /// Set the lower inclusive bound on a title's year. /// /// This applies to either the title's start or end years. 
pub fn year_ge(mut self, year: u32) -> Query { self.year.start = Some(year); self } /// Set the upper inclusive bound on a title's year. /// /// This applies to either the title's start or end years. pub fn year_le(mut self, year: u32) -> Query { self.year.end = Some(year); self } /// Set the lower inclusive bound on a title's number of votes. pub fn votes_ge(mut self, votes: u32) -> Query { self.votes.start = Some(votes); self } /// Set the upper inclusive bound on a title's number of votes. pub fn votes_le(mut self, votes: u32) -> Query { self.votes.end = Some(votes); self } /// Set the lower inclusive bound on a title's season. /// /// This automatically limits all results to episodes. pub fn season_ge(mut self, season: u32) -> Query { self.season.start = Some(season); self } /// Set the upper inclusive bound on a title's season. /// /// This automatically limits all results to episodes. pub fn season_le(mut self, season: u32) -> Query { self.season.end = Some(season); self } /// Set the lower inclusive bound on a title's episode number. /// /// This automatically limits all results to episodes. pub fn episode_ge(mut self, episode: u32) -> Query { self.episode.start = Some(episode); self } /// Set the upper inclusive bound on a title's episode number. /// /// This automatically limits all results to episodes. pub fn episode_le(mut self, episode: u32) -> Query { self.episode.end = Some(episode); self } /// Restrict results to episodes belonging to the TV show given by its /// IMDb ID. /// /// This automatically limits all results to episodes. pub fn tvshow_id(mut self, tvshow_id: &str) -> Query { self.tvshow_id = Some(tvshow_id.to_string()); self } /// Returns true if and only if the given entity matches this query. /// /// Note that this only applies filters in this query. e.g., The name /// aspect of the query, if one exists, is ignored. 
fn matches(&self, ent: &MediaEntity) -> bool { self.matches_title(&ent.title()) && self.matches_rating(ent.rating()) && self.matches_episode(ent.episode()) } /// Returns true if and only if the given title matches this query. /// /// This ignores non-title filters. fn matches_title(&self, title: &Title) -> bool { if !self.kinds.is_empty() && !self.kinds.contains(&title.kind) { return false; } if !self.year.contains(title.start_year.as_ref()) && !self.year.contains(title.end_year.as_ref()) { return false; } true } /// Returns true if and only if the given rating matches this query. /// /// This ignores non-rating filters. /// /// If a rating filter is present and `None` is given, then this always /// returns `false`. fn matches_rating(&self, rating: Option<&Rating>) -> bool { if !self.votes.contains(rating.map(|r| &r.votes)) { return false; } true } /// Returns true if and only if the given episode matches this query. /// /// This ignores non-episode filters. /// /// If an episode filter is present and `None` is given, then this always /// returns `false`. fn matches_episode(&self, ep: Option<&Episode>) -> bool { if !self.season.contains(ep.and_then(|e| e.season.as_ref())) { return false; } if !self.episode.contains(ep.and_then(|e| e.episode.as_ref())) { return false; } if let Some(ref tvshow_id) = self.tvshow_id { if ep.map_or(true, |e| tvshow_id != &e.tvshow_id) { return false; } } true } /// Build a name query suitable for this query. /// /// The name query returned may request many more results than the result /// size maximum on this query. fn name_query(&self) -> Option { let name = match self.name.as_ref() { None => return None, Some(name) => &**name, }; let scorer = match self.name_scorer { None => return None, Some(scorer) => scorer, }; // We want our name query to return a healthy set of results, even if // it's well beyond the result set size requested by the user. 
This is // primarily because a name search doesn't incorporate filters itself, // which simplifies the implementation. Therefore, we need to request // more results than what we need in case our filter is aggressive. let size = cmp::max(1000, self.size); Some(NameQuery::new(name).with_size(size).with_scorer(scorer)) } /// Returns true if and only if this query has any filters. /// /// When a query lacks filters, then the result set can be completely /// determined by searching the name index and applying a similarity /// function, if present. This can make exhaustive searches, particularly /// the ones used during an evaluation, a bit faster. fn has_filters(&self) -> bool { self.needs_rating() || self.needs_episode() || !self.kinds.is_empty() || !self.year.is_none() } /// Returns true if and only this query has only title filters. /// /// When true, this can make exhaustive searches faster by avoiding the /// need to fetch the rating and/or episode for every title in IMDb. fn needs_only_title(&self) -> bool { !self.needs_rating() && !self.needs_episode() } /// Returns true if and only if this query has a rating filter. fn needs_rating(&self) -> bool { !self.votes.is_none() } /// Returns true if and only if this query has an episode filter. fn needs_episode(&self) -> bool { !self.season.is_none() || !self.episode.is_none() || !self.tvshow_id.is_none() } } impl Serialize for Query { fn serialize(&self, s: S) -> result::Result where S: Serializer, { s.serialize_str(&self.to_string()) } } impl<'a> Deserialize<'a> for Query { fn deserialize(d: D) -> result::Result where D: Deserializer<'a>, { use serde::de::Error; let querystr = String::deserialize(d)?; querystr .parse() .map_err(|e: self::Error| D::Error::custom(e.to_string())) } } impl FromStr for Query { type Err = Error; fn from_str(qstr: &str) -> Result { lazy_static! { // The 'directive', 'terms' and 'space' groups are all mutually // exclusive. 
When 'directive' matches, we parse it using DIRECTIVE // in a subsequent step. When 'terms' matches, we add them to the // name query. Then 'space' matches, we ignore it. static ref PARTS: Regex = Regex::new( r"\{(?P[^}]+)\}|(?P[^{}\s]+)|(?P\s+)" ).unwrap(); // Parse a directive of the form '{name:val}' or '{kind}'. static ref DIRECTIVE: Regex = Regex::new( r"^(?:(?P[^:]+):(?P.+)|(?P.+))$" ).unwrap(); } let mut terms = vec![]; let mut q = Query::new(); for caps in PARTS.captures_iter(qstr) { if caps.name("space").is_some() { continue; } else if let Some(m) = caps.name("terms") { terms.push(m.as_str().to_string()); continue; } let dcaps = DIRECTIVE.captures(&caps["directive"]).unwrap(); if let Some(m) = dcaps.name("kind") { q = q.kind(m.as_str().parse()?); continue; } let (name, val) = (dcaps["name"].trim(), dcaps["val"].trim()); match name { "size" => { q.size = val.parse().map_err(Error::number)?; } "year" => { q.year = val.parse()?; } "votes" => { q.votes = val.parse()?; } "season" => { q.season = val.parse()?; } "episode" => { q.episode = val.parse()?; } "tvseries" | "tvshow" | "show" => { q.tvshow_id = Some(val.to_string()); } "sim" | "similarity" => { q.similarity = val.parse()?; } "scorer" => { if val == "none" { q.name_scorer = None; } else { q.name_scorer = Some(val.parse()?); } } unk => return Err(Error::unknown_directive(unk)), } } if !terms.is_empty() { q = q.name(&terms.join(" ")); } Ok(q) } } impl fmt::Display for Query { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self.name_scorer { None => f.write_str("{scorer:none}")?, Some(ref scorer) => write!(f, "{{scorer:{}}}", scorer)?, } write!(f, " {{sim:{}}}", self.similarity)?; write!(f, " {{size:{}}}", self.size)?; let mut kinds: Vec<&TitleKind> = self.kinds.iter().collect(); kinds.sort(); for kind in kinds { write!(f, " {{{}}}", kind)?; } if !self.year.is_none() { write!(f, " {{year:{}}}", self.year)?; } if !self.votes.is_none() { write!(f, " {{votes:{}}}", self.votes)?; } if 
!self.season.is_none() { write!(f, " {{season:{}}}", self.season)?; } if !self.episode.is_none() { write!(f, " {{episode:{}}}", self.episode)?; } if let Some(ref tvshow_id) = self.tvshow_id { write!(f, " {{show:{}}}", tvshow_id)?; } if let Some(ref name) = self.name { write!(f, " {}", name)?; } Ok(()) } } /// A ranking function to use when searching IMDb records. /// /// A similarity ranking function computes a score between `0.0` and `1.0` (not /// including `0` but including `1.0`) for a query and a candidate result. The /// score is determined by the corresponding names for a query and a candidate, /// and a higher score indicates more similarity. /// /// This ranking function can be used to increase the precision of a set /// of results. In particular, when a similarity function is provided to /// a [`Query`](struct.Query.html), then any results returned by querying /// the IMDb name index will be rescored according to this function. If no /// similarity function is provided, then the results will be ranked according /// to scores produced by the name index. #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] pub enum Similarity { /// Do not use a similarity function. None, /// Computes the Levenshtein edit distance between two names and converts /// it to a similarity. Levenshtein, /// Computes the Jaro edit distance between two names and converts it to a /// similarity. Jaro, /// Computes the Jaro-Winkler edit distance between two names and converts /// it to a similarity. JaroWinkler, } impl Similarity { /// Returns a list of s trings representing the possible similarity /// function names. pub fn possible_names() -> &'static [&'static str] { &["none", "levenshtein", "jaro", "jarowinkler"] } /// Returns true if and only if no similarity function was selected. pub fn is_none(&self) -> bool { *self == Similarity::None } /// Computes the similarity between the given strings according to the /// underlying similarity function. 
If no similarity function is present, /// then this always returns `1.0`. /// /// The returned value is always in the range `(0, 1]`. pub fn similarity(&self, q1: &str, q2: &str) -> f64 { let sim = match *self { Similarity::None => 1.0, Similarity::Levenshtein => { let distance = strsim::levenshtein(q1, q2) as f64; // We do a simple conversion of distance to similarity. This // will produce very low scores even for very similar names, // but callers may normalize scores. // // We also add `1` to the denominator to avoid division by // zero. Incidentally, this causes the similarity of identical // strings to be exactly 1.0, which is what we want. 1.0 / (1.0 + distance) } Similarity::Jaro => strsim::jaro(q1, q2), Similarity::JaroWinkler => strsim::jaro_winkler(q1, q2), }; // Don't permit a score to actually be zero. This prevents division // by zero during normalization if all results have a score of zero. if sim < f64::EPSILON { f64::EPSILON } else { sim } } } impl Default for Similarity { fn default() -> Similarity { Similarity::None } } impl fmt::Display for Similarity { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match *self { Similarity::None => write!(f, "none"), Similarity::Levenshtein => write!(f, "levenshtein"), Similarity::Jaro => write!(f, "jaro"), Similarity::JaroWinkler => write!(f, "jarowinkler"), } } } impl FromStr for Similarity { type Err = Error; fn from_str(s: &str) -> Result { match s { "none" => Ok(Similarity::None), "levenshtein" => Ok(Similarity::Levenshtein), "jaro" => Ok(Similarity::Jaro), "jarowinkler" | "jaro-winkler" => Ok(Similarity::JaroWinkler), unk => Err(Error::unknown_sim(unk)), } } } /// A range filter over any partially ordered type `T`. /// /// This type permits either end of the range to be unbounded. 
#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)] struct Range { start: Option, end: Option, } impl Range { pub fn none() -> Range { Range { start: None, end: None } } pub fn is_none(&self) -> bool { self.start.is_none() && self.end.is_none() } } impl Range { pub fn contains(&self, t: Option<&T>) -> bool { let t = match t { None => return self.is_none(), Some(t) => t, }; match (&self.start, &self.end) { (&None, &None) => true, (&Some(ref s), &None) => s <= t, (&None, &Some(ref e)) => t <= e, (&Some(ref s), &Some(ref e)) => s <= t && t <= e, } } } impl fmt::Display for Range { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match (&self.start, &self.end) { (&None, &None) => write!(f, "-"), (&Some(ref s), &None) => write!(f, "{}-", s), (&None, &Some(ref e)) => write!(f, "-{}", e), (&Some(ref s), &Some(ref e)) if s == e => write!(f, "{}", s), (&Some(ref s), &Some(ref e)) => write!(f, "{}-{}", s, e), } } } impl> FromStr for Range { type Err = Error; fn from_str(range: &str) -> Result> { // One wonders what happens if we need to support ranges consisting // of negative numbers. Thankfully, it seems we needn't do that for // the IMDb data. let (start, end) = match range.find('-') { None => { // For no particular reason, parse it twice so that we don't // need a `Clone` bound. 
let start = range.parse().map_err(Error::number)?; let end = range.parse().map_err(Error::number)?; return Ok(Range { start: Some(start), end: Some(end) }); } Some(i) => { let (start, end) = range.split_at(i); (start.trim(), end[1..].trim()) } }; Ok(match (start.is_empty(), end.is_empty()) { (true, true) => Range::none(), (true, false) => Range { start: None, end: Some(end.parse().map_err(Error::number)?), }, (false, true) => Range { start: Some(start.parse().map_err(Error::number)?), end: None, }, (false, false) => Range { start: Some(start.parse().map_err(Error::number)?), end: Some(end.parse().map_err(Error::number)?), }, }) } } #[cfg(test)] mod tests { use super::*; #[test] fn ranges() { let r: Range = "5-10".parse().unwrap(); assert_eq!(r, Range { start: Some(5), end: Some(10) }); let r: Range = "5-".parse().unwrap(); assert_eq!(r, Range { start: Some(5), end: None }); let r: Range = "-10".parse().unwrap(); assert_eq!(r, Range { start: None, end: Some(10) }); let r: Range = "5-5".parse().unwrap(); assert_eq!(r, Range { start: Some(5), end: Some(5) }); let r: Range = "5".parse().unwrap(); assert_eq!(r, Range { start: Some(5), end: Some(5) }); } #[test] fn query_parser() { let q: Query = "foo bar baz".parse().unwrap(); assert_eq!(q, Query::new().name("foo bar baz")); let q: Query = "{movie}".parse().unwrap(); assert_eq!(q, Query::new().kind(TitleKind::Movie)); let q: Query = "{movie} {tvshow}".parse().unwrap(); assert_eq!( q, Query::new().kind(TitleKind::Movie).kind(TitleKind::TVSeries) ); let q: Query = "{movie}{tvshow}".parse().unwrap(); assert_eq!( q, Query::new().kind(TitleKind::Movie).kind(TitleKind::TVSeries) ); let q: Query = "foo {movie} bar {tvshow} baz".parse().unwrap(); assert_eq!( q, Query::new() .name("foo bar baz") .kind(TitleKind::Movie) .kind(TitleKind::TVSeries) ); let q: Query = "{size:5}".parse().unwrap(); assert_eq!(q, Query::new().size(5)); let q: Query = "{ size : 5 }".parse().unwrap(); assert_eq!(q, Query::new().size(5)); let q: Query = 
"{year:1990}".parse().unwrap(); assert_eq!(q, Query::new().year_ge(1990).year_le(1990)); let q: Query = "{year:1990-}".parse().unwrap(); assert_eq!(q, Query::new().year_ge(1990)); let q: Query = "{year:-1990}".parse().unwrap(); assert_eq!(q, Query::new().year_le(1990)); let q: Query = "{year:-}".parse().unwrap(); assert_eq!(q, Query::new()); } #[test] fn query_parser_error() { assert!("{blah}".parse::().is_err()); assert!("{size:a}".parse::().is_err()); assert!("{year:}".parse::().is_err()); } #[test] fn query_parser_weird() { let q: Query = "{movie".parse().unwrap(); assert_eq!(q, Query::new().name("movie")); let q: Query = "movie}".parse().unwrap(); assert_eq!(q, Query::new().name("movie")); } #[test] fn query_display() { let q = Query::new() .name("foo bar baz") .size(31) .season_ge(4) .season_le(5) .kind(TitleKind::TVSeries) .kind(TitleKind::Movie) .similarity(Similarity::Jaro); let expected = "{scorer:okapibm25} {sim:jaro} {size:31} {movie} {tvSeries} {season:4-5} foo bar baz"; assert_eq!(q.to_string(), expected); } #[test] fn query_serialize() { #[derive(Serialize)] struct Test { query: Query, } let query = Query::new() .name("foo bar baz") .name_scorer(None) .size(31) .season_ge(4) .season_le(4); let got = serde_json::to_string(&Test { query }).unwrap(); let expected = r#"{"query":"{scorer:none} {sim:none} {size:31} {season:4} foo bar baz"}"#; assert_eq!(got, expected); } #[test] fn query_deserialize() { let json = r#"{"query": "foo {size:30} bar {season:4} baz {show}"}"#; let expected = "{size:30} {season:4} {show} foo bar baz".parse().unwrap(); #[derive(Deserialize)] struct Test { query: Query, } let got: Test = serde_json::from_str(json).unwrap(); assert_eq!(got.query, expected); } } ================================================ FILE: imdb-index/src/util.rs ================================================ use std::fmt; use std::fs::File; use std::io; use std::path::Path; use std::time; use memmap::Mmap; use crate::error::{Error, ErrorKind, Result}; /// 
The TSV file in the IMDb dataset that defines the canonical set of titles /// available to us. Each record contains basic information about a title, /// such as its IMDb identifier (e.g., `tt0096697`), primary title, start year /// and type. This includes movies, TV shows, episodes and more. pub const IMDB_BASICS: &str = "title.basics.tsv"; /// The TSV file in the IMDb dataset that defines alternate names for some of /// the titles found in IMDB_BASICS. This includes, but is not limited to, /// titles in different languages. This file uses the IMDb identifier as a /// foreign key. pub const IMDB_AKAS: &str = "title.akas.tsv"; /// The TSV file in the IMDb dataset that defines the season and episode /// numbers for episodes in TV shows. Each record in this file corresponds to /// a single episode. There are four columns: the first is the IMDb identifier /// for the episode. The second is the IMDb identifier for the corresponding /// TV show. The last two columns are the season and episode numbers. Both of /// the IMDb identifiers are foreign keys that join the record to IMDB_BASICS. pub const IMDB_EPISODE: &str = "title.episode.tsv"; /// The TSV file in the IMDb dataset that provides ratings for titles in /// IMDB_BASICS. Each title has at most one rating, and a rating corresponds /// to a rank (a decimal in the range 0-10) and the number of votes involved /// in creating that rating (from the IMDb web site, presumably). pub const IMDB_RATINGS: &str = "title.ratings.tsv"; /// A type that provides a Display impl for std::time::Duration. #[derive(Debug)] pub struct NiceDuration(pub time::Duration); impl fmt::Display for NiceDuration { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{:0.4} secs", self.fractional_seconds()) } } impl NiceDuration { /// Create a duration corresponding to the amount of time since the /// instant given. 
pub fn since(t: time::Instant) -> NiceDuration { NiceDuration(time::Instant::now().duration_since(t)) } /// Returns the number of seconds in this duration in fraction form. /// The number to the left of the decimal point is the number of seconds, /// and the number to the right is the number of milliseconds. pub fn fractional_seconds(&self) -> f64 { let fractional = (self.0.subsec_nanos() as f64) / 1_000_000_000.0; self.0.as_secs() as f64 + fractional } } /// A function for creating a CSV reader builder that is pre-loaded with the /// correct settings for reading all IMDb CSV files. pub fn csv_reader_builder() -> csv::ReaderBuilder { let mut builder = csv::ReaderBuilder::new(); builder.has_headers(true).delimiter(b'\t').quoting(false); builder } /// Builds a CSV reader (using `csv_reader_builder`) that is backed by a /// seekable memory map. /// /// We use memory maps for this even though we could use a normal `File`, which /// is also seekable, because seeking a memory map has very little overhead. /// Seeking a `File`, on the other hand, requires a syscall. pub unsafe fn csv_mmap>( path: P, ) -> Result>> { let mmap = mmap_file(path)?; Ok(csv_reader_builder().from_reader(io::Cursor::new(mmap))) } /// Builds a CSV reader (using `csv_reader_builder`) that is backed by a file. /// While this read can be seeked, it will be less efficient than using a /// memory map. Therefore, this is useful for reading CSV data when no seeking /// is needed. pub fn csv_file>(path: P) -> Result> { let path = path.as_ref(); let rdr = csv_reader_builder().from_path(path).map_err(|e| { Error::new(ErrorKind::Csv(format!("{}: {}", path.display(), e))) })?; Ok(rdr) } /// Builds a file-backed memory map. pub unsafe fn mmap_file>(path: P) -> Result { let path = path.as_ref(); let file = open_file(path)?; let mmap = Mmap::map(&file).map_err(|e| Error::io_path(e, path))?; Ok(mmap) } /// Creates a file and truncates it. 
pub fn create_file>(path: P) -> Result { let path = path.as_ref(); let file = File::create(path).map_err(|e| Error::io_path(e, path))?; Ok(file) } /// Opens a file for reading. pub fn open_file>(path: P) -> Result { let path = path.as_ref(); let file = File::open(path).map_err(|e| Error::io_path(e, path))?; Ok(file) } /// Creates an FST set builder for the given file path. pub fn fst_set_builder_file>( path: P, ) -> Result>> { let path = path.as_ref(); let wtr = io::BufWriter::new(create_file(path)?); let builder = fst::SetBuilder::new(wtr).map_err(|e| { Error::new(ErrorKind::Fst(format!("{}: {}", path.display(), e))) })?; Ok(builder) } /// Open an FST set file for the given file path as a memory map. pub unsafe fn fst_set_file>(path: P) -> Result> { let path = path.as_ref(); let file = File::open(path).map_err(|e| Error::io_path(e, path))?; let mmap = Mmap::map(&file).map_err(|e| Error::io_path(e, path))?; let set = fst::Set::new(mmap).map_err(|e| { Error::new(ErrorKind::Fst(format!("{}: {}", path.display(), e))) })?; Ok(set) } /// Creates an FST map builder for the given file path. pub fn fst_map_builder_file>( path: P, ) -> Result>> { let path = path.as_ref(); let wtr = io::BufWriter::new(create_file(path)?); let builder = fst::MapBuilder::new(wtr).map_err(|e| { Error::new(ErrorKind::Fst(format!("{}: {}", path.display(), e))) })?; Ok(builder) } /// Open an FST map file for the given file path as a memory map. 
pub unsafe fn fst_map_file>(path: P) -> Result> { let path = path.as_ref(); let file = File::open(path).map_err(|e| Error::io_path(e, path))?; let mmap = Mmap::map(&file).map_err(|e| Error::io_path(e, path))?; let map = fst::Map::new(mmap).map_err(|e| { Error::new(ErrorKind::Fst(format!("{}: {}", path.display(), e))) })?; Ok(map) } ================================================ FILE: rustfmt.toml ================================================ max_width = 79 use_small_heuristics = "max" ================================================ FILE: src/download.rs ================================================ use std::fs::{self, File}; use std::io; use std::path::{Path, PathBuf}; use {anyhow::Context, flate2::read::GzDecoder}; /// The base URL to the IMDb data set. /// /// It's not clear if this URL will remain free and open forever, although it /// is provided by IMDb proper. If this goes away, we'll need to switch to s3. const IMDB_BASE_URL: &'static str = "https://datasets.imdbws.com"; /// All of the data sets we care about. /// /// We leave out cast/crew because we don't need them for renaming files. const DATA_SETS: &'static [&'static str] = &[ "title.akas.tsv.gz", "title.basics.tsv.gz", "title.episode.tsv.gz", "title.ratings.tsv.gz", ]; /// Download ensures that all of the IMDb data files exist and have non-zero /// size in the given directory. Any path that does not meet these criteria /// is fetched from IMDb. Other paths are left untouched. /// /// Returns true if and only if at least one file was downloaded. pub fn download_all>(dir: P) -> anyhow::Result { let dir = dir.as_ref(); fs::create_dir_all(dir)?; let nonexistent = non_existent_data_sets(dir)?; for dataset in &nonexistent { download_one(dir, dataset)?; } Ok(nonexistent.len() > 0) } /// Update will update all data set files, regardless of whether they already /// exist or not. 
pub fn update_all>(dir: P) -> anyhow::Result<()> { let dir = dir.as_ref(); fs::create_dir_all(dir)?; for dataset in DATA_SETS { download_one(dir, dataset)?; } Ok(()) } /// Downloads a single data set, decompresses it and writes it to the /// corresponding file path in the given directory. fn download_one(outdir: &Path, dataset: &'static str) -> anyhow::Result<()> { let outpath = dataset_path(outdir, dataset); let mut outfile = File::create(&outpath)?; let url = format!("{}/{}", IMDB_BASE_URL, dataset); log::info!("downloading {} to {}", url, outpath.display()); let resp = ureq::get(&url).call().context("HTTP error")?; log::info!("sorting CSV records"); write_sorted_csv_records( GzDecoder::new(resp.into_reader()), &mut outfile, )?; Ok(()) } /// Gets a list of data sets that either don't exist in the current directory /// or have zero size. fn non_existent_data_sets(dir: &Path) -> anyhow::Result> { let mut result = vec![]; for &dataset in DATA_SETS { let path = dataset_path(dir, dataset); if fs::metadata(path).map(|md| md.len() == 0).unwrap_or(true) { result.push(dataset); } } Ok(result) } /// Build the path on disk for a dataset, given the directory and the dataset /// name. fn dataset_path(dir: &Path, name: &'static str) -> PathBuf { let mut path = dir.join(name); // We drop the gz extension since we decompress before writing to disk. path.set_extension(""); path } /// Read all CSV data into memory and sort the records in lexicographic order. /// /// This is unfortunately necessary because the IMDb data is no longer sorted /// in lexicographic order with respect to the `tt` identifiers. This appears /// to be fallout as a result of adding 10 character identifiers (previously, /// only 9 character identifiers were used). 
fn write_sorted_csv_records<R: io::Read, W: io::Write>(
    rdr: R,
    wtr: W,
) -> anyhow::Result<()> {
    use bstr::{io::BufReadExt, ByteSlice};
    use std::io::Write;

    // We actually only sort the raw lines here instead of parsing CSV
    // records, since parsing into CSV records has fairly substantial memory
    // overhead. Since IMDb CSV data never contains a record that spans
    // multiple lines, this transformation is okay.
    let rdr = io::BufReader::new(rdr);
    let mut lines = rdr.byte_lines().collect::<io::Result<Vec<Vec<u8>>>>()?;
    if lines.is_empty() {
        anyhow::bail!("got empty CSV input");
    }
    // Keep the header record first.
    lines[1..].sort_unstable();

    let mut wtr = io::BufWriter::new(wtr);
    let mut prev = None;
    for (i, line) in lines.iter().enumerate() {
        // *sigh* ... Looks like the data downloaded is corrupt sometimes,
        // where there are duplicate rows. Dedupe on the first (identifier)
        // field; after sorting, duplicates are adjacent.
        let first = match line.split_str("\t").next() {
            Some(first) => first,
            None => anyhow::bail!(
                "expected to find one tab-delimited field in '{:?}'",
                line.as_bstr(),
            ),
        };
        if i > 0 && prev == Some(first) {
            continue;
        }
        prev = Some(first);
        wtr.write_all(&line)?;
        wtr.write_all(b"\n")?;
    }
    wtr.flush()?;
    Ok(())
}

================================================ FILE: src/logger.rs ================================================

// This module defines a super simple logger that works with the `log` crate.
// We don't need anything fancy; just basic log levels and the ability to
// print to stderr. We therefore avoid bringing in extra dependencies just
// for this functionality.

use log::Log;

/// Initialize a simple logger.
pub fn init() -> anyhow::Result<()> {
    Ok(Logger::init()?)
}

/// The simplest possible logger that logs to stderr.
///
/// This logger does no filtering. Instead, it relies on the `log` crates
/// filtering via its global max_level setting.
#[derive(Debug)]
struct Logger(());

const LOGGER: &'static Logger = &Logger(());

impl Logger {
    /// Create a new logger that logs to stderr and initialize it as the
    /// global logger.
If there was a problem setting the logger, then an
    /// error is returned.
    fn init() -> std::result::Result<(), log::SetLoggerError> {
        log::set_logger(LOGGER)
    }
}

impl Log for Logger {
    fn enabled(&self, _: &log::Metadata) -> bool {
        // We set the log level via log::set_max_level, so we don't need to
        // implement filtering here.
        true
    }

    fn log(&self, record: &log::Record) {
        if !should_log(record) {
            return;
        }
        eprintln!("{}: {}", record.level(), record.args());
    }

    fn flush(&self) {
        // We use eprintln! which is flushed on every call.
    }
}

/// Returns true only for records produced by this crate or imdb-index, so
/// log noise from third-party libraries is suppressed.
fn should_log(record: &log::Record) -> bool {
    let t = record.target();
    t.starts_with("imdb_rename") || t.starts_with("imdb_index")
}

================================================ FILE: src/main.rs ================================================

use std::env;
use std::ffi::OsStr;
use std::io::{self, Write};
use std::path::PathBuf;
use std::process;

use imdb_index::{Index, IndexBuilder, NgramType, Searcher};
use lazy_static::lazy_static;
use tabwriter::TabWriter;
use walkdir::WalkDir;

use crate::rename::{RenameAction, RenamerBuilder};
use crate::util::{choose, read_yesno, write_tsv};

mod download;
mod logger;
mod rename;
mod util;

fn main() {
    if let Err(err) = try_main() {
        // A pipe error occurs when the consumer of this process's output has
        // hung up. This is a normal event, and we should quit gracefully.
        if is_pipe_error(&err) {
            process::exit(0);
        }
        eprintln!("{:?}", err);
        process::exit(1);
    }
}

fn try_main() -> anyhow::Result<()> {
    logger::init()?;
    log::set_max_level(log::LevelFilter::Info);

    let args = Args::from_matches(&app().get_matches())?;
    if args.debug {
        log::set_max_level(log::LevelFilter::Debug);
    }
    // Forcefully update the data and re-index if requested.
    if args.update_data {
        args.download_all_update()?;
        args.create_index()?;
        return Ok(());
    }
    // Ensure that the necessary data exists.
    if args.download_all()? || args.update_index {
        args.create_index()?;
        if args.update_index {
            return Ok(());
        }
    }
    // Now ensure that the index exists.
    if !args.index_dir.exists() {
        args.create_index()?;
    }

    let mut searcher = args.searcher()?;
    let results = match args.query {
        None => None,
        Some(ref query) => Some(searcher.search(&query.parse()?)?),
    };
    // With no files given, just print the query results (if any).
    if args.files.is_empty() {
        let results = match results {
            None => anyhow::bail!("run with a file to rename or --query"),
            Some(ref results) => results,
        };
        return write_tsv(io::stdout(), &mut searcher, results.as_slice());
    }

    let mut builder = RenamerBuilder::new();
    builder
        .min_votes(args.min_votes)
        .good_threshold(0.25)
        .regex_episode(&args.regex_episode)
        .regex_season(&args.regex_season)
        .regex_year(&args.regex_year);
    if let Some(ref results) = results {
        builder.force(choose(&mut searcher, results.as_slice(), 0.25)?);
    }
    let renamer = builder.build()?;
    let proposals = renamer.propose(
        &mut searcher,
        &args.files,
        args.dest_dir,
        args.rename_action,
    )?;
    if proposals.is_empty() {
        anyhow::bail!("no files to rename");
    }

    // Show the user every proposal and ask for confirmation before touching
    // the file system.
    let mut stdout = TabWriter::new(io::stdout());
    for p in &proposals {
        writeln!(stdout, "{}\t->\t{}", p.src().display(), p.dst().display())?;
    }
    stdout.flush()?;

    if read_yesno(&format!(
        "Are you sure you want to {action} the above files? (y/n) ",
        action = &args.rename_action
    ))? {
        for p in &proposals {
            if let Err(err) = p.rename() {
                eprintln!("{}", err);
            }
        }
    }
    Ok(())
}

/// All of the parsed command line arguments.
#[derive(Debug)]
struct Args {
    data_dir: PathBuf,
    dest_dir: Option<PathBuf>,
    debug: bool,
    files: Vec<PathBuf>,
    index_dir: PathBuf,
    ngram_size: usize,
    ngram_type: NgramType,
    query: Option<String>,
    regex_episode: String,
    regex_season: String,
    regex_year: String,
    update_data: bool,
    update_index: bool,
    min_votes: u32,
    rename_action: RenameAction,
}

impl Args {
    /// Build `Args` from parsed clap matches. Returns an error if any value
    /// fails to parse or if a flag combination is unsupported.
    fn from_matches(matches: &clap::ArgMatches) -> anyhow::Result<Args> {
        let files = collect_paths(
            matches
                .values_of_os("file")
                .map(|it| it.collect())
                .unwrap_or(vec![]),
            matches.is_present("follow"),
        );
        let query = matches.value_of_lossy("query").map(|q| q.into_owned());
        let data_dir =
            matches.value_of_os("data-dir").map(PathBuf::from).unwrap();
        let dest_dir = matches.value_of_os("dest-dir").map(PathBuf::from);
        let index_dir = matches
            .value_of_os("index-dir")
            .map(PathBuf::from)
            .unwrap_or(data_dir.join("index"));
        let regex_episode =
            matches.value_of_lossy("re-episode").unwrap().into_owned();
        let regex_season =
            matches.value_of_lossy("re-season").unwrap().into_owned();
        let regex_year =
            matches.value_of_lossy("re-year").unwrap().into_owned();
        let min_votes = matches.value_of_lossy("votes").unwrap().parse()?;
        let rename_action = if matches.is_present("symlink") {
            if !cfg!(unix) {
                anyhow::bail!(
                    "--symlink currently supported only on Unix \
                     platforms, try hardlink (-H) instead"
                );
            }
            RenameAction::Symlink
        } else if matches.is_present("hardlink") {
            RenameAction::Hardlink
        } else {
            RenameAction::Rename
        };
        Ok(Args {
            data_dir,
            dest_dir,
            debug: matches.is_present("debug"),
            files,
            index_dir,
            ngram_size: matches
                .value_of_lossy("ngram-size")
                .unwrap()
                .parse()?,
            ngram_type: matches
                .value_of_lossy("ngram-type")
                .unwrap()
                .parse()?,
            query,
            regex_episode,
            regex_season,
            regex_year,
            update_data: matches.is_present("update-data"),
            update_index: matches.is_present("update-index"),
            min_votes,
            rename_action,
        })
    }

    /// Build (or rebuild) the on-disk index from the data directory.
    fn create_index(&self) -> anyhow::Result<Index> {
        Ok(IndexBuilder::new()
            .ngram_size(self.ngram_size)
            .ngram_type(self.ngram_type)
            .create(&self.data_dir, &self.index_dir)?)
    }

    /// Open the existing on-disk index.
    fn open_index(&self) -> anyhow::Result<Index> {
        Ok(Index::open(&self.data_dir, &self.index_dir)?)
    }

    /// Open a searcher over the existing index.
    fn searcher(&self) -> anyhow::Result<Searcher> {
        Ok(Searcher::new(self.open_index()?))
    }

    /// Download any missing data files; returns true if any were fetched.
    fn download_all(&self) -> anyhow::Result<bool> {
        download::download_all(&self.data_dir)
    }

    /// Forcefully re-download all data files.
    fn download_all_update(&self) -> anyhow::Result<()> {
        download::update_all(&self.data_dir)
    }
}

fn app() -> clap::App<'static, 'static> {
    use clap::{App, AppSettings, Arg};

    lazy_static! {
        // clap wants all of its strings tied to a particular lifetime, but
        // we'd really like to determine some default values dynamically.
        // Using a lazy_static here is one way of safely giving a static
        // lifetime to a value that is computed at runtime.
        //
        // An alternative approach would be to compute all of our default
        // values in the caller, and pass them into this function. It's nicer
        // to define what we need here though. Locality of reference and all
        // that.
        static ref DATA_DIR: PathBuf = env::temp_dir().join("imdb-rename");
    }

    App::new("imdb-rename")
        .author(clap::crate_authors!())
        .version(clap::crate_version!())
        .max_term_width(100)
        .setting(AppSettings::UnifiedHelpMessage)
        .arg(Arg::with_name("file")
            .multiple(true)
            .help("One or more files to rename."))
        .arg(Arg::with_name("data-dir")
            .long("data-dir")
            .env("IMDB_RENAME_DATA_DIR")
            .takes_value(true)
            .default_value_os(DATA_DIR.as_os_str())
            .help("The location to store IMDb data files."))
        .arg(Arg::with_name("dest-dir")
            .long("dest-dir")
            .short("d")
            .env("IMDB_RENAME_DEST_DIR")
            .takes_value(true)
            .help("The output directory of renamed files \
                   (or symlinks/hardlinks with the -s/-H options). \
                   By default, files are renamed in place."))
        .arg(Arg::with_name("debug")
            .long("debug")
            .help("Show debug messages. Use this when filing bugs."))
        .arg(Arg::with_name("follow")
            .long("follow")
            .short("f")
            .help("Follow directories and attempt to rename all child \
                   entries."))
        .arg(Arg::with_name("index-dir")
            .long("index-dir")
            .env("IMDB_RENAME_INDEX_DIR")
            .takes_value(true)
            .help("The location to store IMDb index files. \
                   When absent, the default is {data-dir}/index."))
        .arg(Arg::with_name("ngram-size")
            .long("ngram-size")
            .default_value("3")
            .help("Choose the ngram size for indexing names. This is only \
                   used at index time and otherwise ignored."))
        .arg(Arg::with_name("ngram-type")
            .long("ngram-type")
            .default_value("window")
            .possible_values(NgramType::possible_names())
            .help("Choose the type of ngram generation. This is only used \
                   at index time and otherwise ignored."))
        .arg(Arg::with_name("query")
            .long("query")
            .short("q")
            .takes_value(true)
            .help("Setting an override query is necessary if the file \
                   path lacks sufficient information to find a matching \
                   title. For example, if a year could not be found. It \
                   is also useful for specifying a TV show when renaming \
                   multiple episodes at once."))
        .arg(Arg::with_name("re-episode")
            .long("re-episode")
            .takes_value(true)
            .default_value(r"[Ee](?P<episode>[0-9]+)")
            .help("A regex for matching episode numbers. The episode number \
                   is extracted by looking for a 'episode' capture group."))
        .arg(Arg::with_name("re-season")
            .long("re-season")
            .takes_value(true)
            .default_value(r"[Ss](?P<season>[0-9]+)")
            .help("A regex for matching season numbers. The season number \
                   is extracted by looking for a 'season' capture group."))
        .arg(Arg::with_name("re-year")
            .long("re-year")
            .takes_value(true)
            .default_value(r"\b(?P<year>[0-9]{4})\b")
            .help("A regex for matching the year. The year is extracted by \
                   looking for a 'year' capture group."))
        .arg(Arg::with_name("update-data")
            .long("update-data")
            .help("Forcefully refreshes the IMDb data and then exits."))
        .arg(Arg::with_name("votes")
            .long("votes")
            .default_value("1000")
            .help("The minimum number of votes required for results matching \
                   a query derived from existing file names. This is not \
                   applied to explicit queries via the -q/--query flag."))
        .arg(Arg::with_name("update-index")
            .long("update-index")
            .help("Forcefully re-indexes the IMDb data and then exits."))
        .arg(Arg::with_name("symlink")
            .long("symlink")
            .short("s")
            .conflicts_with("hardlink")
            .help("Create a symlink instead of renaming. \
                   (Unix only feature.)"))
        .arg(Arg::with_name("hardlink")
            .long("hardlink")
            .short("H")
            .conflicts_with("symlink")
            .help("Create a hardlink instead of renaming. \
                   This doesn't work when renaming directories."))
}

/// Collect all file paths from a sequence of OsStrings from the command line.
/// If `follow` is true, then any paths that are directories are expanded to
/// include all child paths, recursively.
///
/// If there is an error following a path, then it is logged to stderr and
/// otherwise skipped.
fn collect_paths(paths: Vec<&OsStr>, follow: bool) -> Vec<PathBuf> {
    let mut results = vec![];
    for path in paths {
        let path = PathBuf::from(path);
        if !follow || !path.is_dir() {
            results.push(path);
            continue;
        }
        for result in WalkDir::new(path) {
            match result {
                Ok(dent) => results.push(dent.path().to_path_buf()),
                Err(err) => eprintln!("{}", err),
            }
        }
    }
    results
}

/// Return true if and only if an I/O broken pipe error exists in the causal
/// chain of the given error.
fn is_pipe_error(err: &anyhow::Error) -> bool { for cause in err.chain() { if let Some(ioerr) = cause.downcast_ref::() { if ioerr.kind() == io::ErrorKind::BrokenPipe { return true; } } } false } ================================================ FILE: src/rename.rs ================================================ use std::collections::{HashMap, HashSet}; use std::fmt; use std::fs; use std::path::{Path, PathBuf}; use std::sync::Mutex; use imdb_index::{MediaEntity, Query, SearchResults, Searcher, TitleKind}; use lazy_static::lazy_static; use regex::Regex; use crate::util::choose; /// A proposal to rename a `src` file path to a `dst` file path. #[derive(Clone, Debug)] pub struct RenameProposal { src: PathBuf, dst: PathBuf, action: RenameAction, } /// The action to take when renaming a file. #[derive(Copy, Clone, Debug, PartialEq)] pub enum RenameAction { /// This does a simple rename of the file. Rename, /// This creates a symlink to the given file. Symlink, /// This creates a hardlink to the given file. Hardlink, } impl fmt::Display for RenameAction { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { RenameAction::Rename => "rename", RenameAction::Symlink => "symlink", RenameAction::Hardlink => "hardlink", } .fmt(f) } } impl RenameAction { fn is_link(&self) -> bool { match *self { RenameAction::Rename => false, RenameAction::Symlink | RenameAction::Hardlink => true, } } } impl RenameProposal { /// Create a new proposal with the given source and destination. The /// destination is constructed by joining `dst_parent` with `dst_name`. /// `dst_name` is sanitized to be safe as a file name. /// /// The given action determines whether to rename the source to the /// destination, create a symlink or create a hardlink. fn new( src: PathBuf, dst_parent: &Path, dst_name: &str, action: RenameAction, ) -> RenameProposal { lazy_static! 
{ static ref RE_BAD_PATH_CHARS: Regex = Regex::new(r"[\x00/]",).unwrap(); } let name = RE_BAD_PATH_CHARS.replace_all(dst_name, "_"); RenameProposal { src, dst: dst_parent.join(&*name), action } } /// Execute this proposal according to `RenameAction`. pub fn rename(&self) -> anyhow::Result<()> { match self.action { RenameAction::Rename => { fs::rename(&self.src, &self.dst).map_err(|e| { anyhow::anyhow!( "error renaming '{}' to '{}': {}", self.src.display(), self.dst.display(), e, ) })?; } #[cfg(not(unix))] RenameAction::Symlink => { anyhow::bail!("symlinks are only supported for Unix") } #[cfg(unix)] RenameAction::Symlink => { use std::os::unix; unix::fs::symlink(&self.src, &self.dst).map_err(|e| { anyhow::anyhow!( "error symlinking '{}' to '{}': {}", self.src.display(), self.dst.display(), e, ) })?; } RenameAction::Hardlink => { fs::hard_link(&self.src, &self.dst).map_err(|e| { anyhow::anyhow!( "error hardlinking '{}' to '{}': {}", self.src.display(), self.dst.display(), e, ) })?; } } Ok(()) } /// The `src` of this proposal. pub fn src(&self) -> &Path { &self.src } /// The `dst` of this proposal. /// /// Note that the destination is cleansed such that it is safe for /// renaming. e.g., If a `/` occurs in an IMDb title, then it is replaced /// with `_`. pub fn dst(&self) -> &Path { &self.dst } } /// A renamer generates file rename proposals based on IMDb. /// /// Fundamentally, a renamer is an entity linker, which attempts to connect /// file paths on your system that follow a prescribed pattern with canonical /// entity entries in IMDb. /// /// A renamer can be built via a `RenamerBuilder`, and proposals can be /// generated via the `propose` method on `Renamer`. A `Renamer` itself never /// touches the file system. 
#[derive(Debug)] pub struct Renamer { cache: Mutex>>, choose_cache: Mutex>, force: Option, min_votes: u32, good_threshold: f64, episode: Regex, season: Regex, year: Regex, } impl Renamer { /// Propose a set of renames, where each proposal proposes to rename a /// path in the slice given to a new path using its proper title according /// to IMDb. This never executes any changes to the file system. /// /// This returns an error if any two of the proposals recommend an exactly /// equivalent destination path. An error is also returned if a destination /// path already exists. Finally, the proposals are sorted in descending /// order of path length if any one of them is a directory, which should /// permit changing entries in a directory and a directory itself in one /// go. /// /// An optional destination can be given, which when present, is used as /// the directory in which renames/links are created. Similarly, the action /// given specifies whether the proposal should rename a file, symlink to /// it or hardlink to it. /// /// Note that this may log some types of errors to stderr but otherwise /// continue, which means that the set of proposals returned may not cover /// all paths given. Errors resulting from reading the index will cause an /// error to be returned. pub fn propose( &self, searcher: &mut Searcher, paths: &[PathBuf], dest: Option, action: RenameAction, ) -> anyhow::Result> { let mut proposals = vec![]; for path in paths { let result = self.propose_one(searcher, path, dest.as_deref(), action); let proposal = match result { None => continue, Some(proposal) => proposal, }; // If there's no change, then skip it. if proposal.src == proposal.dst { continue; } proposals.push(proposal); } // Check that we have no destination duplicates. If we permit them, // then it would be pretty easy to clobber the user's data. That's bad. // // We also make sure that the destination doesn't already exist. This // isn't atomic, but it's probably a fine approximation. 
let mut seen = HashSet::new(); let mut any_dir = false; for p in &proposals { if seen.contains(&p.dst) { anyhow::bail!( "duplicate rename proposal for '{}'", p.dst.display() ); } seen.insert(p.dst.clone()); if p.dst.exists() { anyhow::bail!( "file path '{}' already exists", p.dst.display() ); } any_dir = any_dir || p.src.is_dir(); } // Finally, sort the proposals such that the longest ones come first. // This should cause child entries to get renamed before parent // entries. if any_dir { proposals.sort_by(|p1, p2| { let (p1, p2) = (p1.dst.as_os_str(), p2.dst.as_os_str()); p1.len().cmp(&p2.len()).reverse() }); } Ok(proposals) } /// Propose a single rename for the given path. /// /// If an error occurs while searching, or if searching yields no results, /// or if an unexpected condition was hit, then an error is logged to /// stderr and `None` is returned. fn propose_one( &self, searcher: &mut Searcher, path: &Path, dest: Option<&Path>, action: RenameAction, ) -> Option { let candidate = match self.candidate(path) { Ok(candidate) => candidate, Err(err) => { eprintln!("[skipping] could not parse file path: {}", err); return None; } }; let result = match candidate.kind { CandidateKind::Any(ref x) => self.find_any(searcher, x), CandidateKind::Episode(ref x) => self.find_episode(searcher, x), CandidateKind::Unknown => self.find_unknown(), }; let ent = match result { Ok(ent) => ent, Err(err) => { eprintln!( "[skipping] error searching for {}: {}", path.display(), err, ); return None; } }; // Setup our sources and destinations. They get tweaked depending on // what our rename action is and whether a destination directory was // explicitly given. let dest_name = candidate.path.imdb_name(&ent); let mut src_path = path.to_path_buf(); let mut dest_parent_dir = dest.map(|d| d.to_path_buf()).unwrap_or(candidate.path.parent); // A symlink was requested to be created in a destination presumably // different than the current directory. 
This means that the file // specified on the commandline will need to be an absolute path, // otherwise the symlink will not point to the correct place. if dest.is_some() && action == RenameAction::Symlink { src_path = match src_path.canonicalize() { Ok(src_path) => src_path, Err(err) => { eprintln!( "[skipping] error making {} an absolute path: {}", src_path.display(), err, ); return None; } }; } // A symlink or hardlink was requested to be created without a // destination specified. In this case, it only makes sense to place // the symlink in the current directory being executed from, otherwise // potentially relative file paths won't match up. if dest.is_none() && action.is_link() { dest_parent_dir = match std::env::current_dir() { Ok(cwd) => cwd, Err(err) => { eprintln!( "[skipping] error getting current directory: {}", err, ); return None; } }; } Some(RenameProposal::new( src_path, &dest_parent_dir, &dest_name, action, )) } /// Search for any entity via its name and a year. In general, this is /// enough information to narrow down the results considerably for most /// movies. /// /// If an entity override is provided, then that is returned instead. fn find_any( &self, searcher: &mut Searcher, candidate: &CandidateAny, ) -> anyhow::Result { // If we already have an entity override, then just use that to build // the proposal and skip any automatic searches. if let Some(ref ent) = self.force { return Ok(ent.clone()); } // Otherwise, try to figure out the "right" name by constructing a // query from the candidate and searching IMDb. let query = self .name_query(&candidate.title) .year_ge(candidate.year) .year_le(candidate.year) // Basically include every kind except for episode and video games. // This helps filter out a lot of noise. 
.kind(TitleKind::Movie) .kind(TitleKind::Short) .kind(TitleKind::TVMiniSeries) .kind(TitleKind::TVMovie) .kind(TitleKind::TVSeries) .kind(TitleKind::TVShort) .kind(TitleKind::TVSpecial) .kind(TitleKind::Video) .votes_ge(self.min_votes); log::debug!("automatic 'any' query: {:?}", query); self.choose_one(searcher, &query) } /// Search for the episode entity corresponding to the episode information /// in the given candidate. If one couldn't be found, then an error is /// returned. /// /// This works by assuming the candidate episode's name is actually the /// TV show name. So we first look for the TV show entity, and then use /// that to find the corresponding episode. fn find_episode( &self, searcher: &mut Searcher, candidate: &CandidateEpisode, ) -> anyhow::Result { let tvshow = self.find_tvshow_for_episode(searcher, candidate)?; let eps = searcher.index().episodes(&tvshow.title().id, candidate.season)?; let ep = match eps .into_iter() .find(|ep| ep.episode == Some(candidate.episode)) { Some(ep) => ep, None => anyhow::bail!( "could not find S{:02}E{:02} for TV show {}", candidate.season, candidate.episode, tvshow.title().id, ), }; match searcher.index().entity(&ep.id)? { Some(ent) => Ok(ent), None => anyhow::bail!( "could not find media entity for episode {}", ep.id ), } } /// Search for the TV show entity corresponding to the episode information /// in the given candidate. If one couldn't be found, then an error is /// returned. /// /// If there is an entity override, then it is used instead. If the /// override isn't a TV show, then an error is returned. fn find_tvshow_for_episode( &self, searcher: &mut Searcher, candidate: &CandidateEpisode, ) -> anyhow::Result { // If we already have an entity override, then just use that as the // TV show. If it isn't a TV show, then return an error. 
if let Some(ref ent) = self.force { if !ent.title().kind.is_tv_series() { anyhow::bail!( "expected TV show to rename episode, but found {}", ent.title().kind ); } return Ok(ent.clone()); } // Otherwise, try to figure out the "right" TV show by constructing a // query from the candidate and searching IMDb. let query = self .name_query(&candidate.tvshow_title) .kind(TitleKind::TVMiniSeries) .kind(TitleKind::TVSeries) .votes_ge(self.min_votes); log::debug!("automatic 'tvshow for episode' query: {:?}", query); self.choose_one(searcher, &query) } /// Return an entity for a completely unknown candidate. /// /// This is invariant with respect to the source path, since we don't /// really know how to interpret it (and if we did, it shouldn't be /// unknown). Therefore, we always defer to the explicit override. If there /// is no override, then this returns an error. /// /// This is useful for renaming files like 'English.srt', where the path /// doesn't contain any useful information and an override is necessary /// anyway. fn find_unknown(&self) -> anyhow::Result { match self.force { Some(ref ent) => Ok(ent.clone()), None => { anyhow::bail!( "could not parse file path and there is no override \ set via -q/--query" ); } } } /// Produce a structured candidate for renaming from a source path. /// /// The candidate returned represents a heuristic analysis performed on /// the source path, and in particular, represents what we think the path /// represents. Principally, this consists of three categories: TV episode, /// any named title with a year, and then everything else. The type of /// candidate we have determines how we guess its canonical entry in IMDb. fn candidate(&self, path: &Path) -> anyhow::Result { let cpath = CandidatePath::from_path(path)?; let name = cpath.base_name.clone(); if let Some(cepisode) = self.episode_parts(&cpath)? 
{ return Ok(Candidate { path: cpath, kind: CandidateKind::Episode(cepisode), }); } let caps_year = match self.year.captures(&name) { None => { return Ok(Candidate { path: cpath, kind: CandidateKind::Unknown, }) } Some(caps) => caps, }; let mat_year = match caps_year.name("year") { None => anyhow::bail!("missing 'year' group in: {}", self.year), Some(mat) => mat, }; let year = mat_year.as_str().parse()?; let title = name[..mat_year.start()].to_string(); Ok(Candidate { path: cpath, kind: CandidateKind::Any(CandidateAny { title, year }), }) } /// Part episode information from the given candidate, if it exists. /// /// If a problem occurred (like detecting a match but missing an expected /// capture group name), then an error is returned. If no episode info /// could be found, then `None` is returned. fn episode_parts( &self, cpath: &CandidatePath, ) -> anyhow::Result> { let name = &cpath.base_name; let caps_season = match self.season.captures(name) { None => return Ok(None), Some(caps) => caps, }; let caps_episode = match self.episode.captures(name) { None => return Ok(None), Some(caps) => caps, }; let mat_season = match caps_season.name("season") { None => { anyhow::bail!("missing 'season' group in: {}", self.season) } Some(mat) => mat, }; let mat_episode = match caps_episode.name("episode") { None => { anyhow::bail!("missing 'episode' group in: {}", self.episode) } Some(mat) => mat, }; let title_end = caps_season.get(0).unwrap().start(); Ok(Some(CandidateEpisode { tvshow_title: name[..title_end].to_string(), season: mat_season.as_str().parse()?, episode: mat_episode.as_str().parse()?, })) } /// Build a query and seed it with the given name, after sanitizing the /// name. fn name_query(&self, name: &str) -> Query { let name = name.replace(".", " "); let name = name.trim(); log::debug!("automatic name query: {:?}", name); Query::new().name(name) } /// Execute a search against the given searcher with the given query and /// choose a single result from the search. 
If no obvious single result /// stands out, then prompt the user for an answer. /// /// If the given query has been executed before, then returned the cached /// answer. fn choose_one( &self, searcher: &mut Searcher, query: &Query, ) -> anyhow::Result { let mut choose_cache = self.choose_cache.lock().unwrap(); if let Some(ent) = choose_cache.get(query) { return Ok(ent.clone()); } let results = self.search(searcher, query)?; let ent = choose(searcher, results.as_slice(), self.good_threshold)?; choose_cache.insert(query.clone(), ent.clone()); Ok(ent) } /// Execute a search against the given searcher with the given query. /// /// If this exact query has been previously executed by this renamer, then /// a cache of results are returned. fn search( &self, searcher: &mut Searcher, query: &Query, ) -> anyhow::Result> { let mut cache = self.cache.lock().unwrap(); if let Some(results) = cache.get(query) { return Ok(results.clone()); } let results = searcher.search(query)?; cache.insert(query.clone(), results.clone()); Ok(results) } } /// A candidate represents a source file path with additional structured /// information that helps us guess what its corresponding canonical IMDb /// entity is. #[derive(Clone, Debug)] struct Candidate { /// The original path that this candidate was drawn from. The path is /// split up into its parent, name and extension components. path: CandidatePath, /// The type of candidate, with potentially additional information /// depending on the type. kind: CandidateKind, } /// A representation of a source path that we'd like to rename. /// /// It is split up into non-overlapping component pieces to make guessing /// easier. In particular, the `parent` and `ext` fields generally aren't /// involved in the guessing process, but are used for reassembling a final /// proposed file path to rename to. In general, only the `base_name` is used /// for guessing. 
/// /// Note that it is not possible to split every possible path into these /// component pieces. Generally, such paths aren't readily guessable, so they /// are skipped (with an error message logged to stderr). #[derive(Clone, Debug)] struct CandidatePath { /// The parent component of the path. e.g., `/foo` in `/foo/bar.mkv`. parent: PathBuf, /// The base name of this path, minus the extention. e.g., `bar` in /// `/foo/bar.mkv`. base_name: String, /// The extension of this path, if it exists, minus the leading `.`. /// e.g., `mkv` in `/foo/bar.mkv`. ext: Option, } /// Type of a candidate, including any additional type-specific information. #[derive(Clone, Debug)] enum CandidateKind { /// A general description of any candidate, with a minimal requirement: /// the source file path must contain a year. Any(CandidateAny), /// A description of a candidate that we believe to be an episode, which /// includes the TV show name, the season number and the episode number. Episode(CandidateEpisode), /// Anything else. Generally, these's nothing we can assume about this /// type, but if the user specifies an override, then we'll still be able /// to rename it. If no override is given, then a candidate with this type /// is skipped. Unknown, } /// A general description of any candidate with a name and a year. The name /// is generally assumed to be all the text preceding the year in the base name /// of a file path. /// /// When we initiate a guess based on this candidate type, we assume it can /// correspond to any entity in IMDb except for TV show episodes. #[derive(Clone, Debug)] struct CandidateAny { /// The presumed title. title: String, /// The presumed year. year: u32, } /// A description of a candidate that we believe to be an episode. This means /// we have captured what we believe to be the TV show's name, along with the /// season and episode numbers. 
The TV show's name is generally assumed to be /// all the text preceding the season number in the base name of a file path. #[derive(Clone, Debug)] struct CandidateEpisode { /// The presumed TV show title. tvshow_title: String, /// The season number. season: u32, /// The episode number. episode: u32, } impl CandidatePath { /// Build a candidate path from a source file path. If a path could not /// be built, then an error is returned. fn from_path(path: &Path) -> anyhow::Result { let parent = match path.parent() { None => anyhow::bail!( "{}: has no parent, cannot rename", path.display() ), Some(parent) => parent.to_path_buf(), }; let name_os = match path.file_name() { None => anyhow::bail!("{}: missing file name", path.display()), Some(name_os) => name_os, }; let name = match name_os.to_str() { None => anyhow::bail!( "{}: invalid UTF-8, cannot rename", path.display() ), Some(name) => name, }; let (base_name, ext) = if path.is_dir() { (name.to_string(), None) } else { match name.rfind('.') { None => (name.to_string(), None), Some(i) => { (name[..i].to_string(), Some(name[i + 1..].to_string())) } } }; Ok(CandidatePath { parent, base_name, ext }) } /// Convert this candidate path to the desired name based on an IMDb /// entity. In general, this replaces the `base_name` of this candidate /// with the title found in the given entity. fn imdb_name(&self, ent: &MediaEntity) -> String { let name = match ent.episode() { Some(ep) => format!( "S{:02}E{:02} - {}", ep.season.unwrap_or(0), ep.episode.unwrap_or(0), ent.title().title, ), None => match ent.title().start_year { None => ent.title().title.to_string(), Some(year) => format!("{} ({})", ent.title().title, year), }, }; match self.ext { None => name, Some(ref ext) => format!("{}.{}", name, ext), } } } /// A builder for configuring a renamer. 
#[derive(Clone, Debug)]
pub struct RenamerBuilder {
    /// Optional entity override; see `force`.
    force: Option<MediaEntity>,
    /// Minimum vote count for automatic queries; see `min_votes`.
    min_votes: u32,
    /// Auto-selection score gap; see `good_threshold`.
    good_threshold: f64,
    /// Pattern with an `episode` capture group; see `regex_episode`.
    regex_episode: String,
    /// Pattern with a `season` capture group; see `regex_season`.
    regex_season: String,
    /// Pattern with a `year` capture group; see `regex_year`.
    regex_year: String,
}

impl RenamerBuilder {
    /// Create a `RenamerBuilder` with default settings.
    pub fn new() -> RenamerBuilder {
        RenamerBuilder {
            force: None,
            min_votes: 1000,
            good_threshold: 0.25,
            // The named groups below are required: the renamer extracts
            // matches via `caps.name("episode")` and friends.
            regex_episode: r"[Ee](?P<episode>[0-9]+)".into(),
            regex_season: r"[Ss](?P<season>[0-9]+)".into(),
            regex_year: r"\b(?P<year>[0-9]{4})\b".into(),
        }
    }

    /// Build a `Renamer` from the current configuration.
    ///
    /// Returns an error if any of the configured regexes fail to compile.
    pub fn build(&self) -> anyhow::Result<Renamer> {
        Ok(Renamer {
            cache: Mutex::new(HashMap::new()),
            choose_cache: Mutex::new(HashMap::new()),
            force: self.force.clone(),
            min_votes: self.min_votes,
            good_threshold: self.good_threshold,
            episode: Regex::new(&self.regex_episode)?,
            season: Regex::new(&self.regex_season)?,
            year: Regex::new(&self.regex_year)?,
        })
    }

    /// Forcefully use the given entity when producing rename proposals.
    ///
    /// When an entity is given here, the renamer will never execute automatic
    /// queries based on the file name. Instead, it will rename every path
    /// given using this entity.
    ///
    /// If a path to be renamed is determined to be a TV episode, then this
    /// entity is assumed to be the entity corresponding to that episode's
    /// TV show. Otherwise, an error will be returned.
    pub fn force(&mut self, entity: MediaEntity) -> &mut RenamerBuilder {
        self.force = Some(entity);
        self
    }

    /// Set the minimum number of votes required for all search results from
    /// automatic queries. This is used when formulating queries based on file
    /// names that aren't TV episodes. The purpose of this is to heuristically
    /// filter out noise from the IMDb data.
    ///
    /// When this isn't specified, a non-zero default is used.
    pub fn min_votes(&mut self, min_votes: u32) -> &mut RenamerBuilder {
        self.min_votes = min_votes;
        self
    }

    /// Sets the "good" threshold for auto-selection.
    ///
    /// When running queries generated from file paths, it is often the case
    /// that multiple results will be returned. If the difference in score
    /// between the first result and second result is greater than or equal
    /// to this threshold, then the first result will be automatically chosen.
    /// Otherwise, a prompt will be shown to the end user requesting an
    /// explicit selection.
    pub fn good_threshold(&mut self, threshold: f64) -> &mut RenamerBuilder {
        self.good_threshold = threshold;
        self
    }

    /// Set the regex for detecting the episode number from a file path.
    ///
    /// Regexes are executed against the base name of a path. The episode
    /// number is extracted via the `episode` named capture group.
    pub fn regex_episode(&mut self, pattern: &str) -> &mut RenamerBuilder {
        self.regex_episode = pattern.to_string();
        self
    }

    /// Set the regex for detecting the season number from a file path.
    ///
    /// Regexes are executed against the base name of a path. The season
    /// number is extracted via the `season` named capture group.
    pub fn regex_season(&mut self, pattern: &str) -> &mut RenamerBuilder {
        self.regex_season = pattern.to_string();
        self
    }

    /// Set the regex for detecting the year from a file path.
    ///
    /// Regexes are executed against the base name of a path. The year is
    /// extracted via the `year` named capture group.
    pub fn regex_year(&mut self, pattern: &str) -> &mut RenamerBuilder {
        self.regex_year = pattern.to_string();
        self
    }
}

impl Default for RenamerBuilder {
    fn default() -> RenamerBuilder {
        RenamerBuilder::new()
    }
}

================================================
FILE: src/util.rs
================================================
use std::io::{self, Write};

use imdb_index::{Episode, MediaEntity, Scored, Searcher, Title};
use tabwriter::TabWriter;

/// Make a choice among the search results given.
///
/// If there is no clear winner, then a prompt is shown to the end user, where
/// they must make a selection.
If a selection is absent or invalid, then an /// error is returned. /// /// The threshold given determines the automatic selection criteria. Namely, /// if the difference of scores between the first and second results is /// greater than or equal to the given threshold, then the first result is /// returned without prompted the end user. pub fn choose( searcher: &mut Searcher, results: &[Scored], good_threshold: f64, ) -> anyhow::Result { if results.is_empty() { anyhow::bail!("no search results available for query"); } else if results.len() == 1 { return Ok(results[0].clone().into_value()); } else if (results[0].score() - results[1].score()) >= good_threshold { return Ok(results[0].clone().into_value()); } write_tsv(io::stdout(), searcher, results)?; let choice = read_number(1, results.len())?; Ok(results[choice - 1].clone().into_value()) } /// Reads a number from stdin in the given inclusive range. pub fn read_number(start: usize, end: usize) -> anyhow::Result { let mut stdout = io::stdout(); write!(stdout, "Please enter your choice [{}-{}]: ", start, end)?; stdout.flush()?; let mut response = String::new(); io::stdin().read_line(&mut response)?; let choice: usize = response.trim().parse()?; if choice < start || choice > end { anyhow::bail!( "invalid choice: {} is not in range [{}-{}]", choice, start, end ); } Ok(choice) } /// Reads a yes/no answer from stdin. This is flexible and recognizes /// y, Y, yes, YES as 'yes' answers. Everything else is recognized as a 'no' /// answer. pub fn read_yesno(msg: &str) -> anyhow::Result { let mut stdout = io::stdout(); write!(stdout, "{}", msg)?; stdout.flush()?; let mut response = String::new(); io::stdin().read_line(&mut response)?; let answer = response.trim().to_lowercase(); Ok(answer == "y" || answer == "yes") } /// Write the given result set to the given writer. 
/// /// If a result is an episode, then the index given is used to look up relevant /// info about its TV show, if one could be found, and include that information /// in the output. pub fn write_tsv( wtr: W, searcher: &mut Searcher, results: &[Scored], ) -> anyhow::Result<()> { let mut wtr = TabWriter::new(wtr).minwidth(4); writeln!(wtr, "#\tscore\tid\tkind\ttitle\tyear\ttv")?; for (i, sr) in results.iter().enumerate() { let (score, ent) = (sr.score(), sr.value()); if let Some(ep) = ent.episode() { match searcher.index().title(&ep.tvshow_id)? { None => write_tsv_title(&mut wtr, i + 1, score, ent)?, Some(tvshow) => { write_tsv_episode( &mut wtr, i + 1, score, ent, &tvshow, ep, )?; } } } else { write_tsv_title(&mut wtr, i + 1, score, ent)?; } } wtr.flush()?; Ok(()) } fn write_tsv_title( mut wtr: W, position: usize, score: f64, ent: &MediaEntity, ) -> anyhow::Result<()> { write!( wtr, "{}\t{:0.3}\t{}\t{}\t{}\t{}", position, score, ent.title().id, ent.title().kind, ent.title().title, ent.title() .start_year .map(|y| y.to_string()) .unwrap_or("N/A".to_string()), )?; write!(wtr, "\n")?; Ok(()) } fn write_tsv_episode( mut wtr: W, position: usize, score: f64, ent: &MediaEntity, tvshow: &Title, ep: &Episode, ) -> anyhow::Result<()> { let tvinfo = format!( "S{:02}E{:02} {}", ep.season.unwrap_or(0), ep.episode.unwrap_or(0), tvshow.title, ); write!( wtr, "{}\t{:0.3}\t{}\t{}\t{}\t{}\t{}", position, score, ent.title().id, ent.title().kind, ent.title().title, ent.title() .start_year .map(|y| y.to_string()) .unwrap_or("N/A".to_string()), tvinfo, )?; write!(wtr, "\n")?; Ok(()) }