Repository: tsoding/seroost Branch: main Commit: 94b6383b0e2b Files: 16 Total size: 57.4 KB Directory structure: gitextract_z9nm0izk/ ├── .gitignore ├── CONTRIBUTING.md ├── Cargo.toml ├── LICENSE ├── README.md └── src/ ├── index.html ├── index.js ├── lexer.rs ├── main.rs ├── model.rs ├── server.rs └── snowball/ ├── algorithms/ │ ├── english_stemmer.rs │ └── mod.rs ├── among.rs ├── mod.rs └── snowball_env.rs ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ /target ================================================ FILE: CONTRIBUTING.md ================================================ I have very limited resources in terms of handling feedback on my projects, sorry. So here are the limitations to keep in mind: - I don't look into reported Issues. - I only look into small PRs that suggest - bug fixes, - documentation fixes. - I do not look into PRs that - implement new features, - refactor/cleanup the code. - What qualifies as a bug, a feature, or refactoring is entirely up to my interpretation. Sorry for any inconvenience. If you want to steer the project in a particular direction in terms of features feel free to fork it, I don't mind. Just make sure you have fun while developing it! This is like the whole point! 
================================================ FILE: Cargo.toml ================================================ [package] name = "seroost" version = "0.1.0" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] poppler-rs = "0.21.0" serde = { version = "1.0.152", features = ["derive"] } serde_json = "1.0.91" tiny_http = "0.12.0" xml-rs = "0.8.4" ================================================ FILE: LICENSE ================================================ Copyright 2023 Alexey Kutepov Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # Local Search Engine in Rust **THIS SOFTWARE IS UNFINISHED!!! 
Don't have any high expectations.** ## Quick Start ```console $ cargo run serve ./folder/ $ iexplore.exe http://localhost:6969/ ``` ================================================ FILE: src/index.html ================================================ Seroost

Provide Your Query:

================================================ FILE: src/index.js ================================================ // TODO: live update results as you type async function search(prompt) { const results = document.getElementById("results") results.innerHTML = ""; const response = await fetch("/api/search", { method: 'POST', headers: {'Content-Type': 'text/plain'}, body: prompt, }); const json = await response.json(); results.innerHTML = ""; for ([path, rank] of json) { let item = document.createElement("span"); item.appendChild(document.createTextNode(path)); item.appendChild(document.createElement("br")); results.appendChild(item); } } let query = document.getElementById("query"); let currentSearch = Promise.resolve() query.addEventListener("keypress", (e) => { if (e.key == "Enter") { currentSearch.then(() => search(query.value)); } }) ================================================ FILE: src/lexer.rs ================================================ pub struct Lexer<'a> { content: &'a [char], } impl<'a> Lexer<'a> { pub fn new(content: &'a [char]) -> Self { Self { content } } fn trim_left(&mut self) { while !self.content.is_empty() && self.content[0].is_whitespace() { self.content = &self.content[1..]; } } fn chop(&mut self, n: usize) -> &'a [char] { let token = &self.content[0..n]; self.content = &self.content[n..]; token } fn chop_while

(&mut self, mut predicate: P) -> &'a [char] where P: FnMut(&char) -> bool { let mut n = 0; while n < self.content.len() && predicate(&self.content[n]) { n += 1; } self.chop(n) } pub fn next_token(&mut self) -> Option { self.trim_left(); if self.content.is_empty() { return None } if self.content[0].is_numeric() { return Some(self.chop_while(|x| x.is_numeric()).iter().collect()); } if self.content[0].is_alphabetic() { let term = self.chop_while(|x| x.is_alphanumeric()).iter().map(|x| x.to_ascii_lowercase()).collect::(); let mut env = crate::snowball::SnowballEnv::create(&term); crate::snowball::algorithms::english_stemmer::stem(&mut env); let stemmed_term = env.get_current().to_string(); return Some(stemmed_term); } return Some(self.chop(1).iter().collect()); } } impl<'a> Iterator for Lexer<'a> { type Item = String; fn next(&mut self) -> Option { self.next_token() } } ================================================ FILE: src/main.rs ================================================ use std::fs::{self, File}; use std::path::{Path}; use xml::reader::{XmlEvent, EventReader}; use xml::common::{Position, TextPosition}; use std::env; use std::result::Result; use std::process::ExitCode; use std::str; use std::io::{BufReader, BufWriter}; use std::sync::{Arc, Mutex}; use std::thread; mod model; use model::*; mod server; mod lexer; pub mod snowball; fn parse_entire_txt_file(file_path: &Path) -> Result { fs::read_to_string(file_path).map_err(|err| { eprintln!("ERROR: coult not open file {file_path}: {err}", file_path = file_path.display()); }) } fn parse_entire_pdf_file(file_path: &Path) -> Result { use poppler::Document; use std::io::Read; let mut content = Vec::new(); File::open(file_path) .and_then(|mut file| file.read_to_end(&mut content)) .map_err(|err| { eprintln!("ERROR: could not read file {file_path}: {err}", file_path = file_path.display()); })?; let pdf = Document::from_data(&content, None).map_err(|err| { eprintln!("ERROR: could not read file {file_path}: {err}", 
file_path = file_path.display()); })?; let mut result = String::new(); let n = pdf.n_pages(); for i in 0..n { let page = pdf.page(i).expect(&format!("{i} is within the bounds of the range of the page")); if let Some(content) = page.text() { result.push_str(content.as_str()); result.push(' '); } } Ok(result) } fn parse_entire_xml_file(file_path: &Path) -> Result { let file = File::open(file_path).map_err(|err| { eprintln!("ERROR: could not open file {file_path}: {err}", file_path = file_path.display()); })?; let er = EventReader::new(BufReader::new(file)); let mut content = String::new(); for event in er.into_iter() { let event = event.map_err(|err| { let TextPosition {row, column} = err.position(); let msg = err.msg(); eprintln!("{file_path}:{row}:{column}: ERROR: {msg}", file_path = file_path.display()); })?; if let XmlEvent::Characters(text) = event { content.push_str(&text); content.push(' '); } } Ok(content) } fn parse_entire_file_by_extension(file_path: &Path) -> Result { let extension = file_path.extension().ok_or_else(|| { eprintln!("ERROR: can't detect file type of {file_path} without extension", file_path = file_path.display()); })?.to_string_lossy(); match extension.as_ref() { "xhtml" | "xml" => parse_entire_xml_file(file_path), // TODO: specialized parser for markdown files "txt" | "md" => parse_entire_txt_file(file_path), "pdf" => parse_entire_pdf_file(file_path), _ => { eprintln!("ERROR: can't detect file type of {file_path}: unsupported extension {extension}", file_path = file_path.display(), extension = extension); Err(()) } } } fn save_model_as_json(model: &Model, index_path: &Path) -> Result<(), ()> { println!("Saving {index_path}...", index_path = index_path.display()); let index_file = File::create(index_path).map_err(|err| { eprintln!("ERROR: could not create index file {index_path}: {err}", index_path = index_path.display()); })?; serde_json::to_writer(BufWriter::new(index_file), &model).map_err(|err| { eprintln!("ERROR: could not serialize 
index into file {index_path}: {err}", index_path = index_path.display()); })?; Ok(()) } fn add_folder_to_model(dir_path: &Path, model: Arc>, processed: &mut usize) -> Result<(), ()> { let dir = fs::read_dir(dir_path).map_err(|err| { eprintln!("ERROR: could not open directory {dir_path} for indexing: {err}", dir_path = dir_path.display()); })?; 'next_file: for file in dir { let file = file.map_err(|err| { eprintln!("ERROR: could not read next file in directory {dir_path} during indexing: {err}", dir_path = dir_path.display()); })?; let file_path = file.path(); let dot_file = file_path .file_name() .and_then(|s| s.to_str()) .map(|s| s.starts_with(".")) .unwrap_or(false); if dot_file { continue 'next_file; } let file_type = file.file_type().map_err(|err| { eprintln!("ERROR: could not determine type of file {file_path}: {err}", file_path = file_path.display()); })?; let last_modified = file.metadata().map_err(|err| { eprintln!("ERROR: could not get the metadata of file {file_path}: {err}", file_path = file_path.display()); })?.modified().map_err(|err| { eprintln!("ERROR: could not get the last modification date of file {file_path}: {err}", file_path = file_path.display()) })?; if file_type.is_dir() { add_folder_to_model(&file_path, Arc::clone(&model), processed)?; continue 'next_file; } // TODO: how does this work with symlinks? 
let mut model = model.lock().unwrap(); if model.requires_reindexing(&file_path, last_modified) { println!("Indexing {:?}...", &file_path); let content = match parse_entire_file_by_extension(&file_path) { Ok(content) => content.chars().collect::>(), // TODO: still add the skipped files to the model to prevent their reindexing in the future Err(()) => continue 'next_file, }; model.add_document(file_path, last_modified, &content); *processed += 1; } } Ok(()) } fn usage(program: &str) { eprintln!("Usage: {program} [SUBCOMMAND] [OPTIONS]"); eprintln!("Subcommands:"); eprintln!(" serve [address] start local HTTP server with Web Interface"); } fn entry() -> Result<(), ()> { let mut args = env::args(); let program = args.next().expect("path to program is provided"); let subcommand = args.next().ok_or_else(|| { usage(&program); eprintln!("ERROR: no subcommand is provided"); })?; match subcommand.as_str() { "serve" => { let dir_path = args.next().ok_or_else(|| { usage(&program); eprintln!("ERROR: no directory is provided for {subcommand} subcommand"); })?; let mut index_path = Path::new(&dir_path).to_path_buf(); index_path.push(".seroost.json"); let address = args.next().unwrap_or("127.0.0.1:6969".to_string()); let exists = index_path.try_exists().map_err(|err| { eprintln!("ERROR: could not check the existence of file {index_path}: {err}", index_path = index_path.display()); })?; let model: Arc>; if exists { let index_file = File::open(&index_path).map_err(|err| { eprintln!("ERROR: could not open index file {index_path}: {err}", index_path = index_path.display()); })?; model = Arc::new(Mutex::new(serde_json::from_reader(index_file).map_err(|err| { eprintln!("ERROR: could not parse index file {index_path}: {err}", index_path = index_path.display()); })?)); } else { model = Arc::new(Mutex::new(Default::default())); } { let model = Arc::clone(&model); thread::spawn(move || { let mut processed = 0; // TODO: what should we do in case indexing thread crashes 
add_folder_to_model(Path::new(&dir_path), Arc::clone(&model), &mut processed).unwrap(); if processed > 0 { let model = model.lock().unwrap(); save_model_as_json(&model, &index_path).unwrap(); } println!("Finished indexing"); }); } server::start(&address, Arc::clone(&model)) } _ => { usage(&program); eprintln!("ERROR: unknown subcommand {subcommand}"); Err(()) } } } fn main() -> ExitCode { match entry() { Ok(()) => ExitCode::SUCCESS, Err(()) => ExitCode::FAILURE, } } // TODO: search result must consist of clickable links // TODO: synonym terms ================================================ FILE: src/model.rs ================================================ use std::collections::HashMap; use std::path::{PathBuf, Path}; use serde::{Deserialize, Serialize}; use super::lexer::Lexer; use std::time::SystemTime; type DocFreq = HashMap; type TermFreq = HashMap; #[derive(Deserialize, Serialize)] pub struct Doc { tf: TermFreq, count: usize, // TODO: make sure that the serde serialization of SystemTime also work on other platforms last_modified: SystemTime, } type Docs = HashMap; #[derive(Default, Deserialize, Serialize)] pub struct Model { pub docs: Docs, pub df: DocFreq, } impl Model { fn remove_document(&mut self, file_path: &Path) { if let Some(doc) = self.docs.remove(file_path) { for t in doc.tf.keys() { if let Some(f) = self.df.get_mut(t) { *f -= 1; } } } } pub fn requires_reindexing(&mut self, file_path: &Path, last_modified: SystemTime) -> bool { if let Some(doc) = self.docs.get(file_path) { return doc.last_modified < last_modified; } return true; } pub fn search_query(&self, query: &[char]) -> Vec<(PathBuf, f32)> { let mut result = Vec::new(); let tokens = Lexer::new(&query).collect::>(); for (path, doc) in &self.docs { let mut rank = 0f32; for token in &tokens { rank += compute_tf(token, doc) * compute_idf(&token, self.docs.len(), &self.df); } // TODO: investigate the sources of NaN if !rank.is_nan() { result.push((path.clone(), rank)); } } result.sort_by(|(_, 
rank1), (_, rank2)| rank1.partial_cmp(rank2).expect(&format!("{rank1} and {rank2} are not comparable"))); result.reverse(); result } pub fn add_document(&mut self, file_path: PathBuf, last_modified: SystemTime, content: &[char]) { self.remove_document(&file_path); let mut tf = TermFreq::new(); let mut count = 0; for t in Lexer::new(content) { if let Some(f) = tf.get_mut(&t) { *f += 1; } else { tf.insert(t, 1); } count += 1; } for t in tf.keys() { if let Some(f) = self.df.get_mut(t) { *f += 1; } else { self.df.insert(t.to_string(), 1); } } self.docs.insert(file_path, Doc {count, tf, last_modified}); } } fn compute_tf(t: &str, doc: &Doc) -> f32 { let n = doc.count as f32; let m = doc.tf.get(t).cloned().unwrap_or(0) as f32; m / n } fn compute_idf(t: &str, n: usize, df: &DocFreq) -> f32 { let n = n as f32; let m = df.get(t).cloned().unwrap_or(1) as f32; (n / m).log10() } ================================================ FILE: src/server.rs ================================================ use std::str; use std::io; use std::sync::{Arc, Mutex}; use super::model::*; use tiny_http::{Server, Request, Response, Header, Method, StatusCode}; fn serve_404(request: Request) -> io::Result<()> { request.respond(Response::from_string("404").with_status_code(StatusCode(404))) } fn serve_500(request: Request) -> io::Result<()> { request.respond(Response::from_string("500").with_status_code(StatusCode(500))) } fn serve_400(request: Request, message: &str) -> io::Result<()> { request.respond(Response::from_string(format!("400: {message}")).with_status_code(StatusCode(400))) } fn serve_bytes(request: Request, bytes: &[u8], content_type: &str) -> io::Result<()> { let content_type_header = Header::from_bytes("Content-Type", content_type) .expect("That we didn't put any garbage in the headers"); request.respond(Response::from_data(bytes).with_header(content_type_header)) } // TODO: the errors of serve_api_search should probably return JSON // 'Cause that's what expected from them. 
fn serve_api_search(model: Arc>, mut request: Request) -> io::Result<()> { let mut buf = Vec::new(); if let Err(err) = request.as_reader().read_to_end(&mut buf) { eprintln!("ERROR: could not read the body of the request: {err}"); return serve_500(request); } let body = match str::from_utf8(&buf) { Ok(body) => body.chars().collect::>(), Err(err) => { eprintln!("ERROR: could not interpret body as UTF-8 string: {err}"); return serve_400(request, "Body must be a valid UTF-8 string"); } }; let model = model.lock().unwrap(); let result = model.search_query(&body); let json = match serde_json::to_string(&result.iter().take(20).collect::>()) { Ok(json) => json, Err(err) => { eprintln!("ERROR: could not convert search results to JSON: {err}"); return serve_500(request) } }; let content_type_header = Header::from_bytes("Content-Type", "application/json") .expect("That we didn't put any garbage in the headers"); request.respond(Response::from_string(&json).with_header(content_type_header)) } fn serve_api_stats(model: Arc>, request: Request) -> io::Result<()> { use serde::Serialize; #[derive(Default, Serialize)] struct Stats { docs_count: usize, terms_count: usize, } let mut stats: Stats = Default::default(); { let model = model.lock().unwrap(); stats.docs_count = model.docs.len(); stats.terms_count = model.df.len(); } let json = match serde_json::to_string(&stats) { Ok(json) => json, Err(err) => { eprintln!("ERROR: could not convert stats results to JSON: {err}"); return serve_500(request) } }; let content_type_header = Header::from_bytes("Content-Type", "application/json") .expect("That we didn't put any garbage in the headers"); request.respond(Response::from_string(&json).with_header(content_type_header)) } fn serve_request(model: Arc>, request: Request) -> io::Result<()> { println!("INFO: received request! 
method: {:?}, url: {:?}", request.method(), request.url()); match (request.method(), request.url()) { (Method::Post, "/api/search") => { serve_api_search(model, request) } (Method::Get, "/api/stats") => { serve_api_stats(model, request) } (Method::Get, "/index.js") => { serve_bytes(request, include_bytes!("index.js"), "text/javascript; charset=utf-8") } (Method::Get, "/") | (Method::Get, "/index.html") => { serve_bytes(request, include_bytes!("index.html"), "text/html; charset=utf-8") } _ => { serve_404(request) } } } pub fn start(address: &str, model: Arc>) -> Result<(), ()> { let server = Server::http(&address).map_err(|err| { eprintln!("ERROR: could not start HTTP server at {address}: {err}"); })?; println!("INFO: listening at http://{address}/"); for request in server.incoming_requests() { serve_request(Arc::clone(&model), request).map_err(|err| { eprintln!("ERROR: could not serve the response: {err}"); }).ok(); // <- don't stop on errors, keep serving } eprintln!("ERROR: the server socket has shutdown"); Err(()) } ================================================ FILE: src/snowball/algorithms/english_stemmer.rs ================================================ //! 
Generated by Snowball 2.2.0 - https://snowballstem.org/ #![allow(non_snake_case)] #![allow(non_upper_case_globals)] #![allow(unused_mut)] #![allow(unused_parens)] #![allow(unused_variables)] use crate::snowball::SnowballEnv; use crate::snowball::Among; static A_0: &'static [Among; 3] = &[ Among("arsen", -1, -1, None), Among("commun", -1, -1, None), Among("gener", -1, -1, None), ]; static A_1: &'static [Among; 3] = &[ Among("'", -1, 1, None), Among("'s'", 0, 1, None), Among("'s", -1, 1, None), ]; static A_2: &'static [Among; 6] = &[ Among("ied", -1, 2, None), Among("s", -1, 3, None), Among("ies", 1, 2, None), Among("sses", 1, 1, None), Among("ss", 1, -1, None), Among("us", 1, -1, None), ]; static A_3: &'static [Among; 13] = &[ Among("", -1, 3, None), Among("bb", 0, 2, None), Among("dd", 0, 2, None), Among("ff", 0, 2, None), Among("gg", 0, 2, None), Among("bl", 0, 1, None), Among("mm", 0, 2, None), Among("nn", 0, 2, None), Among("pp", 0, 2, None), Among("rr", 0, 2, None), Among("at", 0, 1, None), Among("tt", 0, 2, None), Among("iz", 0, 1, None), ]; static A_4: &'static [Among; 6] = &[ Among("ed", -1, 2, None), Among("eed", 0, 1, None), Among("ing", -1, 2, None), Among("edly", -1, 2, None), Among("eedly", 3, 1, None), Among("ingly", -1, 2, None), ]; static A_5: &'static [Among; 24] = &[ Among("anci", -1, 3, None), Among("enci", -1, 2, None), Among("ogi", -1, 13, None), Among("li", -1, 15, None), Among("bli", 3, 12, None), Among("abli", 4, 4, None), Among("alli", 3, 8, None), Among("fulli", 3, 9, None), Among("lessli", 3, 14, None), Among("ousli", 3, 10, None), Among("entli", 3, 5, None), Among("aliti", -1, 8, None), Among("biliti", -1, 12, None), Among("iviti", -1, 11, None), Among("tional", -1, 1, None), Among("ational", 14, 7, None), Among("alism", -1, 8, None), Among("ation", -1, 7, None), Among("ization", 17, 6, None), Among("izer", -1, 6, None), Among("ator", -1, 7, None), Among("iveness", -1, 11, None), Among("fulness", -1, 9, None), Among("ousness", -1, 10, 
None), ]; static A_6: &'static [Among; 9] = &[ Among("icate", -1, 4, None), Among("ative", -1, 6, None), Among("alize", -1, 3, None), Among("iciti", -1, 4, None), Among("ical", -1, 4, None), Among("tional", -1, 1, None), Among("ational", 5, 2, None), Among("ful", -1, 5, None), Among("ness", -1, 5, None), ]; static A_7: &'static [Among; 18] = &[ Among("ic", -1, 1, None), Among("ance", -1, 1, None), Among("ence", -1, 1, None), Among("able", -1, 1, None), Among("ible", -1, 1, None), Among("ate", -1, 1, None), Among("ive", -1, 1, None), Among("ize", -1, 1, None), Among("iti", -1, 1, None), Among("al", -1, 1, None), Among("ism", -1, 1, None), Among("ion", -1, 2, None), Among("er", -1, 1, None), Among("ous", -1, 1, None), Among("ant", -1, 1, None), Among("ent", -1, 1, None), Among("ment", 15, 1, None), Among("ement", 16, 1, None), ]; static A_8: &'static [Among; 2] = &[ Among("e", -1, 1, None), Among("l", -1, 2, None), ]; static A_9: &'static [Among; 8] = &[ Among("succeed", -1, -1, None), Among("proceed", -1, -1, None), Among("exceed", -1, -1, None), Among("canning", -1, -1, None), Among("inning", -1, -1, None), Among("earring", -1, -1, None), Among("herring", -1, -1, None), Among("outing", -1, -1, None), ]; static A_10: &'static [Among; 18] = &[ Among("andes", -1, -1, None), Among("atlas", -1, -1, None), Among("bias", -1, -1, None), Among("cosmos", -1, -1, None), Among("dying", -1, 3, None), Among("early", -1, 9, None), Among("gently", -1, 7, None), Among("howe", -1, -1, None), Among("idly", -1, 6, None), Among("lying", -1, 4, None), Among("news", -1, -1, None), Among("only", -1, 10, None), Among("singly", -1, 11, None), Among("skies", -1, 2, None), Among("skis", -1, 1, None), Among("sky", -1, -1, None), Among("tying", -1, 5, None), Among("ugly", -1, 8, None), ]; static G_v: &'static [u8; 4] = &[17, 65, 16, 1]; static G_v_WXY: &'static [u8; 5] = &[1, 17, 65, 208, 1]; static G_valid_LI: &'static [u8; 3] = &[55, 141, 2]; #[derive(Clone)] struct Context { b_Y_found: bool, 
i_p2: i32, i_p1: i32, } fn r_prelude(env: &mut SnowballEnv, context: &mut Context) -> bool { context.b_Y_found = false; let v_1 = env.cursor; 'lab0: loop { env.bra = env.cursor; if !env.eq_s(&"'") { break 'lab0; } env.ket = env.cursor; if !env.slice_del() { return false; } break 'lab0; } env.cursor = v_1; let v_2 = env.cursor; 'lab1: loop { env.bra = env.cursor; if !env.eq_s(&"y") { break 'lab1; } env.ket = env.cursor; if !env.slice_from("Y") { return false; } context.b_Y_found = true; break 'lab1; } env.cursor = v_2; let v_3 = env.cursor; 'lab2: loop { 'replab3: loop{ let v_4 = env.cursor; 'lab4: for _ in 0..1 { 'golab5: loop { let v_5 = env.cursor; 'lab6: loop { if !env.in_grouping(G_v, 97, 121) { break 'lab6; } env.bra = env.cursor; if !env.eq_s(&"y") { break 'lab6; } env.ket = env.cursor; env.cursor = v_5; break 'golab5; } env.cursor = v_5; if env.cursor >= env.limit { break 'lab4; } env.next_char(); } if !env.slice_from("Y") { return false; } context.b_Y_found = true; continue 'replab3; } env.cursor = v_4; break 'replab3; } break 'lab2; } env.cursor = v_3; return true; } fn r_mark_regions(env: &mut SnowballEnv, context: &mut Context) -> bool { context.i_p1 = env.limit; context.i_p2 = env.limit; let v_1 = env.cursor; 'lab0: loop { 'lab1: loop { let v_2 = env.cursor; 'lab2: loop { if env.find_among(A_0, context) == 0 { break 'lab2; } break 'lab1; } env.cursor = v_2; 'golab3: loop { 'lab4: loop { if !env.in_grouping(G_v, 97, 121) { break 'lab4; } break 'golab3; } if env.cursor >= env.limit { break 'lab0; } env.next_char(); } 'golab5: loop { 'lab6: loop { if !env.out_grouping(G_v, 97, 121) { break 'lab6; } break 'golab5; } if env.cursor >= env.limit { break 'lab0; } env.next_char(); } break 'lab1; } context.i_p1 = env.cursor; 'golab7: loop { 'lab8: loop { if !env.in_grouping(G_v, 97, 121) { break 'lab8; } break 'golab7; } if env.cursor >= env.limit { break 'lab0; } env.next_char(); } 'golab9: loop { 'lab10: loop { if !env.out_grouping(G_v, 97, 121) { break 'lab10; 
} break 'golab9; } if env.cursor >= env.limit { break 'lab0; } env.next_char(); } context.i_p2 = env.cursor; break 'lab0; } env.cursor = v_1; return true; } fn r_shortv(env: &mut SnowballEnv, context: &mut Context) -> bool { 'lab0: loop { let v_1 = env.limit - env.cursor; 'lab1: loop { if !env.out_grouping_b(G_v_WXY, 89, 121) { break 'lab1; } if !env.in_grouping_b(G_v, 97, 121) { break 'lab1; } if !env.out_grouping_b(G_v, 97, 121) { break 'lab1; } break 'lab0; } env.cursor = env.limit - v_1; if !env.out_grouping_b(G_v, 97, 121) { return false; } if !env.in_grouping_b(G_v, 97, 121) { return false; } if env.cursor > env.limit_backward { return false; } break 'lab0; } return true; } fn r_R1(env: &mut SnowballEnv, context: &mut Context) -> bool { if !(context.i_p1 <= env.cursor){ return false; } return true; } fn r_R2(env: &mut SnowballEnv, context: &mut Context) -> bool { if !(context.i_p2 <= env.cursor){ return false; } return true; } fn r_Step_1a(env: &mut SnowballEnv, context: &mut Context) -> bool { let mut among_var; let v_1 = env.limit - env.cursor; 'lab0: loop { env.ket = env.cursor; if env.find_among_b(A_1, context) == 0 { env.cursor = env.limit - v_1; break 'lab0; } env.bra = env.cursor; if !env.slice_del() { return false; } break 'lab0; } env.ket = env.cursor; among_var = env.find_among_b(A_2, context); if among_var == 0 { return false; } env.bra = env.cursor; if among_var == 1 { if !env.slice_from("ss") { return false; } } else if among_var == 2 { 'lab1: loop { let v_2 = env.limit - env.cursor; 'lab2: loop { if !env.hop_back(2) { break 'lab2; } if !env.slice_from("i") { return false; } break 'lab1; } env.cursor = env.limit - v_2; if !env.slice_from("ie") { return false; } break 'lab1; } } else if among_var == 3 { if env.cursor <= env.limit_backward { return false; } env.previous_char(); 'golab3: loop { 'lab4: loop { if !env.in_grouping_b(G_v, 97, 121) { break 'lab4; } break 'golab3; } if env.cursor <= env.limit_backward { return false; } 
env.previous_char(); } if !env.slice_del() { return false; } } return true; } fn r_Step_1b(env: &mut SnowballEnv, context: &mut Context) -> bool { let mut among_var; env.ket = env.cursor; among_var = env.find_among_b(A_4, context); if among_var == 0 { return false; } env.bra = env.cursor; if among_var == 1 { if !r_R1(env, context) { return false; } if !env.slice_from("ee") { return false; } } else if among_var == 2 { let v_1 = env.limit - env.cursor; 'golab0: loop { 'lab1: loop { if !env.in_grouping_b(G_v, 97, 121) { break 'lab1; } break 'golab0; } if env.cursor <= env.limit_backward { return false; } env.previous_char(); } env.cursor = env.limit - v_1; if !env.slice_del() { return false; } let v_3 = env.limit - env.cursor; among_var = env.find_among_b(A_3, context); if among_var == 0 { return false; } env.cursor = env.limit - v_3; if among_var == 1 { let c = env.cursor; let (bra, ket) = (env.cursor, env.cursor); env.insert(bra, ket, "e"); env.cursor = c; } else if among_var == 2 { env.ket = env.cursor; if env.cursor <= env.limit_backward { return false; } env.previous_char(); env.bra = env.cursor; if !env.slice_del() { return false; } } else if among_var == 3 { if env.cursor != context.i_p1 { return false; } let v_4 = env.limit - env.cursor; if !r_shortv(env, context) { return false; } env.cursor = env.limit - v_4; let c = env.cursor; let (bra, ket) = (env.cursor, env.cursor); env.insert(bra, ket, "e"); env.cursor = c; } } return true; } fn r_Step_1c(env: &mut SnowballEnv, context: &mut Context) -> bool { env.ket = env.cursor; 'lab0: loop { let v_1 = env.limit - env.cursor; 'lab1: loop { if !env.eq_s_b(&"y") { break 'lab1; } break 'lab0; } env.cursor = env.limit - v_1; if !env.eq_s_b(&"Y") { return false; } break 'lab0; } env.bra = env.cursor; if !env.out_grouping_b(G_v, 97, 121) { return false; } 'lab2: loop { if env.cursor > env.limit_backward { break 'lab2; } return false; } if !env.slice_from("i") { return false; } return true; } fn r_Step_2(env: &mut 
SnowballEnv, context: &mut Context) -> bool { let mut among_var; env.ket = env.cursor; among_var = env.find_among_b(A_5, context); if among_var == 0 { return false; } env.bra = env.cursor; if !r_R1(env, context) { return false; } if among_var == 1 { if !env.slice_from("tion") { return false; } } else if among_var == 2 { if !env.slice_from("ence") { return false; } } else if among_var == 3 { if !env.slice_from("ance") { return false; } } else if among_var == 4 { if !env.slice_from("able") { return false; } } else if among_var == 5 { if !env.slice_from("ent") { return false; } } else if among_var == 6 { if !env.slice_from("ize") { return false; } } else if among_var == 7 { if !env.slice_from("ate") { return false; } } else if among_var == 8 { if !env.slice_from("al") { return false; } } else if among_var == 9 { if !env.slice_from("ful") { return false; } } else if among_var == 10 { if !env.slice_from("ous") { return false; } } else if among_var == 11 { if !env.slice_from("ive") { return false; } } else if among_var == 12 { if !env.slice_from("ble") { return false; } } else if among_var == 13 { if !env.eq_s_b(&"l") { return false; } if !env.slice_from("og") { return false; } } else if among_var == 14 { if !env.slice_from("less") { return false; } } else if among_var == 15 { if !env.in_grouping_b(G_valid_LI, 99, 116) { return false; } if !env.slice_del() { return false; } } return true; } fn r_Step_3(env: &mut SnowballEnv, context: &mut Context) -> bool { let mut among_var; env.ket = env.cursor; among_var = env.find_among_b(A_6, context); if among_var == 0 { return false; } env.bra = env.cursor; if !r_R1(env, context) { return false; } if among_var == 1 { if !env.slice_from("tion") { return false; } } else if among_var == 2 { if !env.slice_from("ate") { return false; } } else if among_var == 3 { if !env.slice_from("al") { return false; } } else if among_var == 4 { if !env.slice_from("ic") { return false; } } else if among_var == 5 { if !env.slice_del() { return false; 
// NOTE(review): the next line is the tail of a function (r_Step_3, by
// position) that begins before this excerpt — tokens preserved unchanged.
} } else if among_var == 6 { if !r_R2(env, context) { return false; } if !env.slice_del() { return false; } } return true; }

/// Snowball English stemmer, Step 4: strip a suffix matched from table `A_7`
/// when it lies inside region R2. For the `among_var == 2` case the suffix is
/// only deleted when it is immediately preceded by an "s" or a "t".
/// The labeled-`loop`/`break` shape throughout this file is characteristic of
/// machine-generated Snowball code (see `algorithms/mod.rs` below).
fn r_Step_4(env: &mut SnowballEnv, context: &mut Context) -> bool {
    let mut among_var;
    env.ket = env.cursor;                        // mark right edge of candidate suffix
    among_var = env.find_among_b(A_7, context);  // longest backward match in A_7
    if among_var == 0 { return false; }
    env.bra = env.cursor;                        // mark left edge of matched suffix
    if !r_R2(env, context) { return false; }     // suffix must start inside R2
    if among_var == 1 {
        if !env.slice_del() { return false; }
    } else if among_var == 2 {
        // Accept only when an "s" or a "t" directly precedes the suffix.
        'lab0: loop {
            let v_1 = env.limit - env.cursor;
            'lab1: loop {
                if !env.eq_s_b(&"s") { break 'lab1; }
                break 'lab0;
            }
            env.cursor = env.limit - v_1; // backtrack and try "t" instead
            if !env.eq_s_b(&"t") { return false; }
            break 'lab0;
        }
        if !env.slice_del() { return false; }
    }
    return true;
}

/// Step 5: conditionally delete a suffix matched from table `A_8`.
/// `among_var == 1`: delete when the match is in R2, or in R1 provided
/// `r_shortv` does not hold just before it.
/// `among_var == 2`: delete only when in R2 and immediately preceded by "l".
fn r_Step_5(env: &mut SnowballEnv, context: &mut Context) -> bool {
    let mut among_var;
    env.ket = env.cursor;
    among_var = env.find_among_b(A_8, context);
    if among_var == 0 { return false; }
    env.bra = env.cursor;
    if among_var == 1 {
        'lab0: loop {
            let v_1 = env.limit - env.cursor;
            'lab1: loop {
                if !r_R2(env, context) { break 'lab1; }
                break 'lab0; // in R2: delete unconditionally
            }
            env.cursor = env.limit - v_1;
            if !r_R1(env, context) { return false; }
            // In R1 only: refuse when the preceding text satisfies r_shortv.
            let v_2 = env.limit - env.cursor;
            'lab2: loop {
                if !r_shortv(env, context) { break 'lab2; }
                return false;
            }
            env.cursor = env.limit - v_2;
            break 'lab0;
        }
        if !env.slice_del() { return false; }
    } else if among_var == 2 {
        if !r_R2(env, context) { return false; }
        if !env.eq_s_b(&"l") { return false; }
        if !env.slice_del() { return false; }
    }
    return true;
}

/// Backward-mode exception list 2: succeeds only when the whole remaining
/// word (all the way down to `limit_backward`) matches an entry of `A_9`;
/// `stem` then skips steps 1b-5 for that word.
fn r_exception2(env: &mut SnowballEnv, context: &mut Context) -> bool {
    env.ket = env.cursor;
    if env.find_among_b(A_9, context) == 0 { return false; }
    env.bra = env.cursor;
    // The match must have consumed everything back to the backward limit.
    if env.cursor > env.limit_backward { return false; }
    return true;
}

/// Forward-mode exception list 1: if the *entire* word matches an entry of
/// `A_10` (table not visible in this excerpt), replace it with one of the
/// fixed stems below and stop; `stem` then bypasses the normal pipeline.
fn r_exception1(env: &mut SnowballEnv, context: &mut Context) -> bool {
    let mut among_var;
    env.bra = env.cursor;
    among_var = env.find_among(A_10, context);
    if among_var == 0 { return false; }
    env.ket = env.cursor;
    // The match must span the whole string.
    if env.cursor < env.limit { return false; }
    if among_var == 1 {
        if !env.slice_from("ski") { return false; }
    } else if among_var == 2 {
        if !env.slice_from("sky") { return false; }
    } else if among_var == 3 {
        if !env.slice_from("die") { return false; }
    } else if among_var == 4 {
        if !env.slice_from("lie") { return false; }
    } else if among_var == 5 {
        if !env.slice_from("tie") { return false; }
    } else if among_var == 6 {
        if !env.slice_from("idl") { return false; }
    } else if among_var == 7 {
        if !env.slice_from("gentl") { return false; }
    } else if among_var == 8 {
        if !env.slice_from("ugli") { return false; }
    } else if among_var == 9 {
        if !env.slice_from("earli") { return false; }
    } else if among_var == 10 {
        if !env.slice_from("onli") { return false; }
    } else if among_var == 11 {
        if !env.slice_from("singl") { return false; }
    }
    return true;
}

/// Turn every remaining "Y" in the string back into "y". Runs only when
/// `context.b_Y_found` was set (presumably by `r_prelude`, which is outside
/// this excerpt — TODO confirm).
fn r_postlude(env: &mut SnowballEnv, context: &mut Context) -> bool {
    if !context.b_Y_found { return false; }
    // repeat { scan forward to next "Y"; replace it with "y" } until none left.
    'replab0: loop {
        let v_1 = env.cursor;
        'lab1: for _ in 0..1 {
            'golab2: loop {
                let v_2 = env.cursor;
                'lab3: loop {
                    env.bra = env.cursor;
                    if !env.eq_s(&"Y") { break 'lab3; }
                    env.ket = env.cursor;
                    env.cursor = v_2;
                    break 'golab2;
                }
                env.cursor = v_2;
                if env.cursor >= env.limit { break 'lab1; } // end of string: done
                env.next_char();
            }
            if !env.slice_from("y") { return false; }
            continue 'replab0;
        }
        env.cursor = v_1;
        break 'replab0;
    }
    return true;
}

/// Entry point: run the full Snowball English stemming pipeline over `env`.
/// Order: exception list 1; a minimum-length gate (`hop(3)` — words shorter
/// than 3 characters are returned unchanged); prelude + region marking; then
/// the backward steps 1a, (exception list 2 | 1b 1c 2 3 4 5); finally the
/// postlude. Always returns true.
pub fn stem(env: &mut SnowballEnv) -> bool {
    // NOTE(review): `mut` on this binding is redundant — `context` is never
    // reassigned (clippy would flag it). Left untouched here.
    let mut context = &mut Context {
        b_Y_found: false,
        i_p2: 0,
        i_p1: 0,
    };
    'lab0: loop {
        let v_1 = env.cursor;
        'lab1: loop {
            // Whole-word exception? Then we are done immediately.
            if !r_exception1(env, context) { break 'lab1; }
            break 'lab0;
        }
        env.cursor = v_1;
        'lab2: loop {
            let v_2 = env.cursor;
            'lab3: loop {
                // Fewer than 3 characters: skip the whole pipeline.
                if !env.hop(3) { break 'lab3; }
                break 'lab2;
            }
            env.cursor = v_2;
            break 'lab0;
        }
        env.cursor = v_1;
        r_prelude(env, context);
        r_mark_regions(env, context); // presumably computes i_p1/i_p2 — body not in this excerpt
        // Switch to backward (right-to-left) mode for the suffix steps.
        env.limit_backward = env.cursor;
        env.cursor = env.limit;
        let v_5 = env.limit - env.cursor;
        r_Step_1a(env, context);
        env.cursor = env.limit - v_5;
        'lab4: loop {
            let v_6 = env.limit - env.cursor;
            'lab5: loop {
                // Exception list 2 short-circuits the remaining steps.
                if !r_exception2(env, context) { break 'lab5; }
                break 'lab4;
            }
            env.cursor = env.limit - v_6;
            // Each step runs on the previous step's output; the cursor is
            // restored from a saved distance-to-limit after every step.
            let v_7 = env.limit - env.cursor;
            r_Step_1b(env, context);
            env.cursor = env.limit - v_7;
            let v_8 = env.limit - env.cursor;
            r_Step_1c(env, context);
            env.cursor = env.limit - v_8;
            let v_9 = env.limit - env.cursor;
            r_Step_2(env, context);
            env.cursor = env.limit - v_9;
            let v_10 = env.limit - env.cursor;
            r_Step_3(env, context);
            env.cursor = env.limit - v_10;
            let v_11 = env.limit - env.cursor;
            r_Step_4(env, context);
            env.cursor = env.limit - v_11;
            let v_12 = env.limit - env.cursor;
            r_Step_5(env, context);
            env.cursor = env.limit - v_12;
            break 'lab4;
        }
        env.cursor = env.limit_backward; // back to forward mode
        let v_13 = env.cursor;
        r_postlude(env, context);
        env.cursor = v_13;
        break 'lab0;
    }
    return true;
}
================================================ FILE: src/snowball/algorithms/mod.rs ================================================
// Have a look at build.rs
//include!(concat!(env!("OUT_DIR"), "/lang_include.rs"));
pub mod english_stemmer;
================================================ FILE: src/snowball/among.rs ================================================
use crate::snowball::SnowballEnv;

/// One entry of a Snowball "among" table: (search string, index of the next
/// entry to try on mismatch — see `find_among`'s use of field `.1`, result
/// value returned on a match — field `.2`, optional gating closure — `.3`).
/// NOTE(review): `T` is referenced in the `Fn` bound but never declared here;
/// upstream rust-stemmers declares this as `Among<T: 'static>(...)`, so the
/// angle-bracketed generic parameter appears to have been lost during text
/// extraction. Confirm against the original file before relying on this listing.
pub struct Among(pub &'static str, pub i32, pub i32, pub Option<&'static (dyn Fn(&mut SnowballEnv, &mut T) -> bool + Sync)>);
================================================ FILE: src/snowball/mod.rs ================================================
// TODO: add Snowball license in here
pub mod algorithms;
mod among;
mod snowball_env;
// TODO: why do we need this `crate::`?
pub use crate::snowball::among::Among;
pub use crate::snowball::snowball_env::SnowballEnv;
================================================ FILE: src/snowball/snowball_env.rs ================================================
use std::borrow::Cow;
use crate::snowball::Among;

/// Mutable string-processing state shared by all Snowball routines.
/// All positions (`cursor`, `limit`, `limit_backward`, `bra`, `ket`) are
/// *byte* offsets into `current`, stored as `i32`; the char-boundary loops
/// below keep them on UTF-8 character boundaries.
#[derive(Debug, Clone)]
pub struct SnowballEnv<'a> {
    pub current: Cow<'a, str>, // the word being stemmed (rewritten in place by replace_s)
    pub cursor: i32,           // current scan position
    pub limit: i32,            // right boundary for forward-mode operations
    pub limit_backward: i32,   // left boundary for backward-mode operations
    pub bra: i32,              // left edge of the currently marked slice
    pub ket: i32,              // right edge of the currently marked slice
}

impl<'a> SnowballEnv<'a> {
    /// Build an environment covering the whole of `value`:
    /// cursor at 0, `limit`/`ket` at the byte length.
    pub fn create(value: &'a str) -> Self {
        let len = value.len();
        SnowballEnv {
            current: Cow::from(value),
            cursor: 0,
            limit: len as i32,
            limit_backward: 0,
            bra: 0,
            ket: len as i32,
        }
    }

    /// Consume the environment and yield the (possibly rewritten) string.
    pub fn get_current(self) -> Cow<'a, str> {
        self.current
    }

    /// Replace the working string with a borrowed value.
    /// NOTE(review): the position fields are *not* reset here — callers must
    /// do that themselves if they reuse the env.
    pub fn set_current(&mut self, current: &'a str) {
        self.current = Cow::from(current);
    }

    /// Replace the working string with an owned value (positions not reset).
    pub fn set_current_s(&mut self, current: String) {
        self.current = Cow::from(current);
    }

    /// Splice `s` over the byte range `bra..ket` of `current`, shifting
    /// `limit` — and `cursor`, when it sits at or past `ket` — by the length
    /// difference. Returns that signed difference ("adjustment").
    fn replace_s(&mut self, bra: i32, ket: i32, s: &str) -> i32 {
        let adjustment = s.len() as i32 - (ket - bra);
        let mut result = String::with_capacity(self.current.len());
        {
            let (lhs, _) = self.current.split_at(bra as usize);
            let (_, rhs) = self.current.split_at(ket as usize);
            result.push_str(lhs);
            result.push_str(s);
            result.push_str(rhs);
        }
        // ... not very nice...
        let new_lim = self.limit + adjustment;
        self.limit = new_lim;
        if self.cursor >= ket {
            let new_cur = self.cursor + adjustment;
            self.cursor = new_cur;
        } else if self.cursor > bra {
            // Cursor was inside the replaced range: clamp it to the start.
            self.cursor = bra
        }
        self.current = Cow::from(result);
        adjustment
    }

    /// Check if s is after cursor.
    /// If so, move cursor to the end of s
    /// NOTE(review): the comparison slice is `current[cursor..]`, not bounded
    /// by `limit`, so a match could extend past `limit` — confirm intended.
    pub fn eq_s(&mut self, s: &str) -> bool {
        if self.cursor >= self.limit {
            return false;
        }
        if self.current[(self.cursor as usize)..].starts_with(s) {
            self.cursor += s.len() as i32;
            // Defensive: step forward until the cursor is on a char boundary.
            while !self.current.is_char_boundary(self.cursor as usize) {
                self.cursor += 1;
            }
            true
        } else {
            false
        }
    }

    /// Check if 's' is before cursor
    /// If so, move cursor to the beginning of s
    pub fn eq_s_b(&mut self, s: &str) -> bool {
        if (self.cursor - self.limit_backward) < s.len() as i32 {
            // Not enough bytes between limit_backward and the cursor.
            false
        // Check if cursor - s.len() lands on a char boundary; if it does not,
        // the bytes there cannot form `s`, so fail without slicing (which
        // would panic on a non-boundary index).
        } else if !self.current.is_char_boundary(self.cursor as usize - s.len()) ||
                  !self.current[self.cursor as usize - s.len()..].starts_with(s) {
            false
        } else {
            self.cursor -= s.len() as i32;
            true
        }
    }

    /// Replace string between `bra` and `ket` with s
    pub fn slice_from(&mut self, s: &str) -> bool {
        let (bra, ket) = (self.bra, self.ket);
        self.replace_s(bra, ket, s);
        true // replace_s cannot fail; always succeeds
    }

    /// Move cursor to next character
    pub fn next_char(&mut self) {
        self.cursor += 1;
        // Skip UTF-8 continuation bytes until we land on a char boundary.
        while !self.current.is_char_boundary(self.cursor as usize) {
            self.cursor += 1;
        }
    }

    /// Move cursor to previous character
    pub fn previous_char(&mut self) {
        self.cursor -= 1;
        while !self.current.is_char_boundary(self.cursor as usize) {
            self.cursor -= 1;
        }
    }

    /// Advance the cursor by `delta` characters (not bytes).
    /// Fails — leaving the cursor unmoved — if that would run past `limit`.
    pub fn hop(&mut self, mut delta: i32) -> bool {
        let mut res = self.cursor;
        while delta > 0 {
            delta -= 1;
            if res >= self.limit {
                return false;
            }
            res += 1;
            while res < self.limit && !self.current.is_char_boundary(res as usize) {
                res += 1;
            }
        }
        self.cursor = res;
        return true;
    }

    /// `hop`, but additionally rejects negative distances.
    pub fn hop_checked(&mut self, delta: i32) -> bool {
        return delta >= 0 && self.hop(delta);
    }

    /// Move the cursor back by `delta` characters.
    /// Fails — leaving the cursor unmoved — if that would pass `limit_backward`.
    pub fn hop_back(&mut self, mut delta: i32) -> bool {
        let mut res = self.cursor;
        while delta > 0 {
            delta -= 1;
            if res <= self.limit_backward {
                return false;
            }
            res -= 1;
            while res > self.limit_backward && !self.current.is_char_boundary(res as usize) {
                res -= 1;
            }
        }
        self.cursor = res;
        return true;
    }

    /// `hop_back`, but additionally rejects negative distances.
    pub fn hop_back_checked(&mut self, delta: i32) -> bool {
        return delta >= 0 && self.hop_back(delta);
    }

    // A grouping is represented by a minimum code point, a maximum code point,
    // and a bitfield of which code points in that range are in the grouping.
    // For example, in english.sbl, valid_LI is 'cdeghkmnrt'.
    // The minimum and maximum code points are 99 and 116,
    // so every time one of these grouping functions is called for g_valid_LI,
    // min must be 99 and max must be 116. There are 18 code points within that
    // range (inclusive) so the grouping is represented with 18 bits, plus 6 bits of padding:
    //
    // cdefghij klmnopqr st
    // 11101100 10110001 01000000
    //
    // The first bit is the least significant.
    // Those three bytes become &[0b00110111, 0b10001101, 0b00000010],
    // which is &[55, 141, 2], which is how g_valid_LI is defined in english.rs.

    /// Check if the char the cursor points to is in the grouping
    /// On success the cursor advances one character; on failure it is unmoved.
    pub fn in_grouping(&mut self, chars: &[u8], min: u32, max: u32) -> bool {
        if self.cursor >= self.limit {
            return false;
        }
        if let Some(chr) = self.current[self.cursor as usize..].chars().next() {
            let mut ch = chr as u32; //codepoint as integer
            if ch > max || ch < min {
                return false;
            }
            ch -= min;
            // Test this code point's bit in the packed bitfield (see above).
            if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 {
                return false;
            }
            self.next_char();
            return true;
        }
        return false;
    }

    /// Backward-mode variant: test the char just *before* the cursor.
    /// On success the cursor ends one character back; on failure it is restored.
    pub fn in_grouping_b(&mut self, chars: &[u8], min: u32, max: u32) -> bool {
        if self.cursor <= self.limit_backward {
            return false;
        }
        self.previous_char();
        if let Some(chr) = self.current[self.cursor as usize..].chars().next() {
            let mut ch = chr as u32; //codepoint as integer
            self.next_char(); // restore the cursor; moved back again only on success
            if ch > max || ch < min {
                return false;
            }
            ch -= min;
            if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 {
                return false;
            }
            self.previous_char();
            return true;
        }
        return false;
    }

    /// Complement of `in_grouping`: succeed (and advance one character) when
    /// the char at the cursor is NOT in the grouping.
    pub fn out_grouping(&mut self, chars: &[u8], min: u32, max: u32) -> bool {
        if self.cursor >= self.limit {
            return false;
        }
        if let Some(chr) = self.current[self.cursor as usize..].chars().next() {
            let mut ch = chr as u32; //codepoint as integer
            if ch > max || ch < min {
                self.next_char();
                return true;
            }
            ch -= min;
            if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 {
                self.next_char();
                return true;
            }
        }
        return false;
    }

    /// Complement of `in_grouping_b`: succeed (ending one character back)
    /// when the char before the cursor is NOT in the grouping.
    pub fn out_grouping_b(&mut self, chars: &[u8], min: u32, max: u32) -> bool {
        if self.cursor <= self.limit_backward {
            return false;
        }
        self.previous_char();
        if let Some(chr) = self.current[self.cursor as usize..].chars().next() {
            let mut ch = chr as u32; //codepoint as integer
            self.next_char(); // restore; moved back again only on success
            if ch > max || ch < min {
                self.previous_char();
                return true;
            }
            ch -= min;
            if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 {
                self.previous_char();
                return true;
            }
        }
        return false;
    }

    /// Helper function that removes the string slice between `bra` and `ket`
    pub fn slice_del(&mut self) -> bool {
        self.slice_from("")
    }

    /// Replace byte range `bra..ket` with `s`, then shift this env's own
    /// `bra`/`ket` marks by the length change when the edit lies at or before
    /// them.
    pub fn insert(&mut self, bra: i32, ket: i32, s: &str) {
        let adjustment = self.replace_s(bra, ket, s);
        if bra <= self.bra {
            self.bra = self.bra + adjustment;
        }
        if bra <= self.ket {
            self.ket = self.ket + adjustment;
        }
    }

    /// Owned copy of the string up to `limit`.
    pub fn assign_to(&mut self) -> String {
        self.current[0..self.limit as usize].to_string()
    }

    /// Owned copy of the currently marked `bra..ket` slice.
    pub fn slice_to(&mut self) -> String {
        self.current[self.bra as usize..self.ket as usize].to_string()
    }

    /// Longest-match search of `amongs` at the cursor (forward direction).
    /// Phase 1 binary-searches the sorted table using cached common-prefix
    /// lengths; phase 2 walks the mismatch links (field `.1`) to the longest
    /// fully-matched entry, honouring its optional gating closure (`.3`).
    /// On a match the cursor advances past the matched string and the entry's
    /// result value (`.2`) is returned; otherwise returns 0.
    /// NOTE(review): `T` is used but never declared — upstream this is
    /// `fn find_among<T>(..., amongs: &[Among<T>], ...)`; the generics appear
    /// to have been stripped during extraction. Verify against the original.
    pub fn find_among(&mut self, amongs: &[Among], context: &mut T) -> i32 {
        use std::cmp::min;
        let mut i: i32 = 0;
        let mut j: i32 = amongs.len() as i32;
        let c = self.cursor;
        let l = self.limit;
        let mut common_i = 0i32; // prefix length already known shared with entry `i`
        let mut common_j = 0i32; // prefix length already known shared with entry `j`
        let mut first_key_inspected = false;
        // Phase 1: binary search for the candidate entry.
        loop {
            let k = i + ((j - i) >> 1);
            let mut diff: i32 = 0;
            let mut common = min(common_i, common_j);
            let w = &amongs[k as usize];
            for lvar in common..w.0.len() as i32 {
                if c + common == l {
                    diff = -1; // ran out of input: input sorts before the key
                    break;
                }
                diff = self.current.as_bytes()[(c + common) as usize] as i32 - w.0.as_bytes()[lvar as usize] as i32;
                if diff != 0 {
                    break;
                }
                common += 1;
            }
            if diff < 0 {
                j = k;
                common_j = common;
            } else {
                i = k;
                common_i = common;
            }
            if j - i <= 1 {
                if i > 0 { break; }
                if j == i { break; }
                if first_key_inspected { break; }
                first_key_inspected = true;
            }
        }
        // Phase 2: follow the `.1` links toward shorter entries until one
        // fully matches (and its gating closure, if any, accepts).
        loop {
            let w = &amongs[i as usize];
            if common_i >= w.0.len() as i32 {
                self.cursor = c + w.0.len() as i32;
                if let Some(ref method) = w.3 {
                    let res = method(self, context);
                    self.cursor = c + w.0.len() as i32; // re-set: the closure may move the cursor
                    if res {
                        return w.2;
                    }
                } else {
                    return w.2;
                }
            }
            i = w.1;
            if i < 0 {
                return 0; // chain exhausted: no match
            }
        }
    }

    /// Backward (suffix) variant of `find_among`: keys are compared
    /// right-to-left against the bytes just before the cursor, bounded by
    /// `limit_backward`; on a match the cursor moves to the *start* of the
    /// matched suffix.
    /// NOTE(review): same stripped-generics caveat as `find_among`.
    pub fn find_among_b(&mut self, amongs: &[Among], context: &mut T) -> i32 {
        let mut i: i32 = 0;
        let mut j: i32 = amongs.len() as i32;
        let c = self.cursor;
        let lb = self.limit_backward;
        let mut common_i = 0i32;
        let mut common_j = 0i32;
        let mut first_key_inspected = false;
        loop {
            let k = i + ((j - i) >> 1);
            let mut diff: i32 = 0;
            let mut common = if common_i < common_j { common_i } else { common_j };
            let w = &amongs[k as usize];
            // Compare from the end of the key toward its start.
            for lvar in (0..w.0.len() - common as usize).rev() {
                if c - common == lb {
                    diff = -1; // hit the backward limit
                    break;
                }
                diff = self.current.as_bytes()[(c - common - 1) as usize] as i32 - w.0.as_bytes()[lvar] as i32;
                if diff != 0 {
                    break;
                }
                // Count up commons. But not one character but the byte width of that char
                common += 1;
            }
            if diff < 0 {
                j = k;
                common_j = common;
            } else {
                i = k;
                common_i = common;
            }
            if j - i <= 1 {
                if i > 0 { break; }
                if j == i { break; }
                if first_key_inspected { break; }
                first_key_inspected = true;
            }
        }
        loop {
            let w = &amongs[i as usize];
            if common_i >= w.0.len() as i32 {
                self.cursor = c - w.0.len() as i32;
                if let Some(ref method) = w.3 {
                    let res = method(self, context);
                    self.cursor = c - w.0.len() as i32; // re-set after the closure runs
                    if res {
                        return w.2;
                    }
                } else {
                    return w.2;
                }
            }
            i = w.1;
            if i < 0 {
                return 0;
            }
        }
    }
}