Repository: tsoding/seroost
Branch: main
Commit: 94b6383b0e2b
Files: 16
Total size: 57.4 KB
Directory structure:
gitextract_z9nm0izk/
├── .gitignore
├── CONTRIBUTING.md
├── Cargo.toml
├── LICENSE
├── README.md
└── src/
├── index.html
├── index.js
├── lexer.rs
├── main.rs
├── model.rs
├── server.rs
└── snowball/
├── algorithms/
│ ├── english_stemmer.rs
│ └── mod.rs
├── among.rs
├── mod.rs
└── snowball_env.rs
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
/target
================================================
FILE: CONTRIBUTING.md
================================================
I have very limited resources in terms of handling feedback on my projects, sorry. So here are the limitations to keep in mind:
- I don't look into reported Issues.
- I only look into small PRs that suggest
- bug fixes,
- documentation fixes.
- I do not look into PRs that
- implement new features,
- refactor/cleanup the code.
- What qualifies as a bug, a feature, or refactoring is entirely upon my interpretation.
Sorry for any inconvenience. If you want to steer the project in a particular direction in terms of features, feel free to fork it, I don't mind. Just make sure you have fun while developing it! This is like the whole point!
================================================
FILE: Cargo.toml
================================================
[package]
name = "seroost"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
poppler-rs = "0.21.0"
serde = { version = "1.0.152", features = ["derive"] }
serde_json = "1.0.91"
tiny_http = "0.12.0"
xml-rs = "0.8.4"
================================================
FILE: LICENSE
================================================
Copyright 2023 Alexey Kutepov <reximkut@gmail.com>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
================================================
FILE: README.md
================================================
# Local Search Engine in Rust
**THIS SOFTWARE IS UNFINISHED!!! Don't have any high expectations.**
## Quick Start
```console
$ cargo run serve ./folder/
$ iexplore.exe http://localhost:6969/
```
================================================
FILE: src/index.html
================================================
<!-- Minimal web UI for seroost: a query box plus a results container.
     Served by src/server.rs at "/" and "/index.html". -->
<html>
<head>
    <title>Seroost</title>
</head>
<body>
    <h1>Provide Your Query:</h1>
    <input id="query" type="text" />
    <!-- Search results are rendered here by index.js -->
    <div id="results"></div>
    <script src="index.js"></script>
</body>
</html>
================================================
FILE: src/index.js
================================================
// TODO: live update results as you type
// POST the query to the backend search API and render the ranked
// file paths into the #results container.
async function search(prompt) {
    const results = document.getElementById("results");
    results.innerHTML = ""; // clear stale results right away for feedback
    const response = await fetch("/api/search", {
        method: 'POST',
        headers: {'Content-Type': 'text/plain'},
        body: prompt,
    });
    const json = await response.json();
    results.innerHTML = ""; // clear again in case anything rendered while awaiting
    // Each entry is a [path, rank] pair; rank is not displayed yet.
    // `const` is required here: the original bare destructuring created
    // accidental globals and throws in strict-mode/module contexts.
    for (const [path, rank] of json) {
        const item = document.createElement("span");
        item.appendChild(document.createTextNode(path));
        item.appendChild(document.createElement("br"));
        results.appendChild(item);
    }
}
// Fire a search when Enter is pressed in the query box.
let query = document.getElementById("query");
let currentSearch = Promise.resolve()
query.addEventListener("keypress", (e) => {
    if (e.key == "Enter") {
        // Reassign so each search is chained after the previous one settles;
        // without the reassignment every Enter press ran concurrently and
        // could interleave result rendering.
        currentSearch = currentSearch.then(() => search(query.value));
    }
})
================================================
FILE: src/lexer.rs
================================================
/// A simple tokenizer over a slice of characters that yields lowercased,
/// stemmed terms suitable for indexing.
pub struct Lexer<'a> {
    // Remaining unconsumed input; shrinks from the front as tokens are chopped off.
    content: &'a [char],
}
impl<'a> Lexer<'a> {
    /// Create a lexer over the given character slice.
    pub fn new(content: &'a [char]) -> Self {
        Self { content }
    }

    /// Skip any leading whitespace.
    fn trim_left(&mut self) {
        while self.content.first().map_or(false, |c| c.is_whitespace()) {
            self.content = &self.content[1..];
        }
    }

    /// Split off the first `n` characters and advance past them.
    fn chop(&mut self, n: usize) -> &'a [char] {
        let (head, tail) = self.content.split_at(n);
        self.content = tail;
        head
    }

    /// Chop off the longest prefix whose characters all satisfy `predicate`.
    fn chop_while<P>(&mut self, mut predicate: P) -> &'a [char] where P: FnMut(&char) -> bool {
        let len = self.content.iter().take_while(|&c| predicate(c)).count();
        self.chop(len)
    }

    /// Produce the next token: a run of digits, a lowercased + stemmed word,
    /// or a single symbol character. Returns `None` once input is exhausted.
    pub fn next_token(&mut self) -> Option<String> {
        self.trim_left();
        let first = *self.content.first()?;
        if first.is_numeric() {
            return Some(self.chop_while(|x| x.is_numeric()).iter().collect());
        }
        if first.is_alphabetic() {
            let term: String = self
                .chop_while(|x| x.is_alphanumeric())
                .iter()
                .map(|x| x.to_ascii_lowercase())
                .collect();
            let mut env = crate::snowball::SnowballEnv::create(&term);
            crate::snowball::algorithms::english_stemmer::stem(&mut env);
            return Some(env.get_current().to_string());
        }
        Some(self.chop(1).iter().collect())
    }
}
/// Allow a `Lexer` to be used directly in `for` loops and iterator chains;
/// simply delegates to `next_token`.
impl<'a> Iterator for Lexer<'a> {
    type Item = String;
    fn next(&mut self) -> Option<Self::Item> {
        self.next_token()
    }
}
================================================
FILE: src/main.rs
================================================
use std::fs::{self, File};
use std::path::{Path};
use xml::reader::{XmlEvent, EventReader};
use xml::common::{Position, TextPosition};
use std::env;
use std::result::Result;
use std::process::ExitCode;
use std::str;
use std::io::{BufReader, BufWriter};
use std::sync::{Arc, Mutex};
use std::thread;
mod model;
use model::*;
mod server;
mod lexer;
pub mod snowball;
/// Read the entire contents of a plain-text file.
/// On failure the error is logged to stderr and `Err(())` is returned.
fn parse_entire_txt_file(file_path: &Path) -> Result<String, ()> {
    fs::read_to_string(file_path).map_err(|err| {
        // fixed typo: "coult" -> "could"
        eprintln!("ERROR: could not open file {file_path}: {err}", file_path = file_path.display());
    })
}
/// Extract the text of every page of a PDF file, pages separated by a space.
/// Logs an error to stderr and returns `Err(())` on read/parse failure.
fn parse_entire_pdf_file(file_path: &Path) -> Result<String, ()> {
    use poppler::Document;
    use std::io::Read;

    // Slurp the raw bytes first; poppler parses from an in-memory buffer.
    let mut content = Vec::new();
    File::open(file_path)
        .and_then(|mut file| file.read_to_end(&mut content))
        .map_err(|err| {
            eprintln!("ERROR: could not read file {file_path}: {err}", file_path = file_path.display());
        })?;

    let pdf = Document::from_data(&content, None).map_err(|err| {
        eprintln!("ERROR: could not read file {file_path}: {err}",
                  file_path = file_path.display());
    })?;

    let mut result = String::new();

    let n = pdf.n_pages();
    for i in 0..n {
        // i is always in 0..n_pages, so the page lookup is expected to succeed.
        let page = pdf.page(i).expect(&format!("{i} is within the bounds of the range of the page"));
        if let Some(content) = page.text() {
            result.push_str(content.as_str());
            result.push(' ');
        }
    }

    Ok(result)
}
/// Concatenate all character data of an XML/XHTML file, separated by spaces;
/// markup (tags, attributes) is discarded. Parse errors are logged with the
/// file:row:column position and reported as `Err(())`.
fn parse_entire_xml_file(file_path: &Path) -> Result<String, ()> {
    let file = File::open(file_path).map_err(|err| {
        eprintln!("ERROR: could not open file {file_path}: {err}", file_path = file_path.display());
    })?;
    let er = EventReader::new(BufReader::new(file));
    let mut content = String::new();
    for event in er.into_iter() {
        let event = event.map_err(|err| {
            let TextPosition {row, column} = err.position();
            let msg = err.msg();
            eprintln!("{file_path}:{row}:{column}: ERROR: {msg}", file_path = file_path.display());
        })?;
        // Only text nodes contribute to the index.
        if let XmlEvent::Characters(text) = event {
            content.push_str(&text);
            content.push(' ');
        }
    }
    Ok(content)
}
/// Dispatch file parsing based on the file's extension.
/// A missing or unsupported extension is logged to stderr and yields `Err(())`.
fn parse_entire_file_by_extension(file_path: &Path) -> Result<String, ()> {
    let extension = file_path
        .extension()
        .ok_or_else(|| {
            eprintln!("ERROR: can't detect file type of {file_path} without extension",
                      file_path = file_path.display());
        })?
        .to_string_lossy();
    match extension.as_ref() {
        "xhtml" | "xml" => parse_entire_xml_file(file_path),
        // TODO: specialized parser for markdown files
        "txt" | "md" => parse_entire_txt_file(file_path),
        "pdf" => parse_entire_pdf_file(file_path),
        unsupported => {
            eprintln!("ERROR: can't detect file type of {file_path}: unsupported extension {extension}",
                      file_path = file_path.display(),
                      extension = unsupported);
            Err(())
        }
    }
}
/// Serialize the whole model as JSON into `index_path`.
/// Errors are logged to stderr and reported as `Err(())`.
fn save_model_as_json(model: &Model, index_path: &Path) -> Result<(), ()> {
    println!("Saving {index_path}...", index_path = index_path.display());

    let index_file = File::create(index_path).map_err(|err| {
        eprintln!("ERROR: could not create index file {index_path}: {err}",
                  index_path = index_path.display());
    })?;

    let writer = BufWriter::new(index_file);
    serde_json::to_writer(writer, &model).map_err(|err| {
        eprintln!("ERROR: could not serialize index into file {index_path}: {err}",
                  index_path = index_path.display());
    })
}
/// Recursively index every supported file under `dir_path` into `model`.
/// `processed` counts how many files were actually (re)indexed so the caller
/// can decide whether the index file needs to be saved again.
/// Dot-files are skipped; files that fail to parse are skipped but logged.
fn add_folder_to_model(dir_path: &Path, model: Arc<Mutex<Model>>, processed: &mut usize) -> Result<(), ()> {
    let dir = fs::read_dir(dir_path).map_err(|err| {
        eprintln!("ERROR: could not open directory {dir_path} for indexing: {err}",
                  dir_path = dir_path.display());
    })?;

    'next_file: for file in dir {
        let file = file.map_err(|err| {
            eprintln!("ERROR: could not read next file in directory {dir_path} during indexing: {err}",
                      dir_path = dir_path.display());
        })?;

        let file_path = file.path();

        // Skip hidden entries (".git", ".seroost.json", ...).
        let dot_file = file_path
            .file_name()
            .and_then(|s| s.to_str())
            .map(|s| s.starts_with("."))
            .unwrap_or(false);
        if dot_file {
            continue 'next_file;
        }

        let file_type = file.file_type().map_err(|err| {
            eprintln!("ERROR: could not determine type of file {file_path}: {err}",
                      file_path = file_path.display());
        })?;

        let last_modified = file.metadata().map_err(|err| {
            eprintln!("ERROR: could not get the metadata of file {file_path}: {err}",
                      file_path = file_path.display());
        })?.modified().map_err(|err| {
            eprintln!("ERROR: could not get the last modification date of file {file_path}: {err}",
                      file_path = file_path.display())
        })?;

        if file_type.is_dir() {
            add_folder_to_model(&file_path, Arc::clone(&model), processed)?;
            continue 'next_file;
        }

        // TODO: how does this work with symlinks?
        // The lock is taken per file so the HTTP server can keep answering
        // queries while indexing is still in progress.
        let mut model = model.lock().unwrap();
        if model.requires_reindexing(&file_path, last_modified) {
            println!("Indexing {:?}...", &file_path);

            let content = match parse_entire_file_by_extension(&file_path) {
                Ok(content) => content.chars().collect::<Vec<_>>(),
                // TODO: still add the skipped files to the model to prevent their reindexing in the future
                Err(()) => continue 'next_file,
            };

            model.add_document(file_path, last_modified, &content);
            *processed += 1;
        }
    }

    Ok(())
}
/// Print command-line usage information for `program` to stderr.
fn usage(program: &str) {
    eprintln!("Usage: {program} [SUBCOMMAND] [OPTIONS]");
    eprintln!("Subcommands:");
    eprintln!("    serve <folder> [address]       start local HTTP server with Web Interface");
}
/// Parse CLI arguments and dispatch to the requested subcommand.
/// Currently only `serve <folder> [address]` is supported: it loads (or
/// creates) the index for `folder`, re-indexes the folder in a background
/// thread, and serves the web interface on `address` (default 127.0.0.1:6969).
fn entry() -> Result<(), ()> {
    let mut args = env::args();
    let program = args.next().expect("path to program is provided");

    let subcommand = args.next().ok_or_else(|| {
        usage(&program);
        eprintln!("ERROR: no subcommand is provided");
    })?;

    match subcommand.as_str() {
        "serve" => {
            let dir_path = args.next().ok_or_else(|| {
                usage(&program);
                eprintln!("ERROR: no directory is provided for {subcommand} subcommand");
            })?;

            // The index lives inside the served folder as a hidden JSON file.
            let mut index_path = Path::new(&dir_path).to_path_buf();
            index_path.push(".seroost.json");

            let address = args.next().unwrap_or("127.0.0.1:6969".to_string());

            let exists = index_path.try_exists().map_err(|err| {
                eprintln!("ERROR: could not check the existence of file {index_path}: {err}",
                          index_path = index_path.display());
            })?;

            let model: Arc<Mutex<Model>>;
            if exists {
                // Reuse a previously saved index...
                let index_file = File::open(&index_path).map_err(|err| {
                    eprintln!("ERROR: could not open index file {index_path}: {err}",
                              index_path = index_path.display());
                })?;
                model = Arc::new(Mutex::new(serde_json::from_reader(index_file).map_err(|err| {
                    eprintln!("ERROR: could not parse index file {index_path}: {err}",
                              index_path = index_path.display());
                })?));
            } else {
                // ...or start from an empty model.
                model = Arc::new(Mutex::new(Default::default()));
            }

            {
                // Index in the background so the server can start immediately.
                let model = Arc::clone(&model);
                thread::spawn(move || {
                    let mut processed = 0;
                    // TODO: what should we do in case indexing thread crashes
                    add_folder_to_model(Path::new(&dir_path), Arc::clone(&model), &mut processed).unwrap();
                    if processed > 0 {
                        let model = model.lock().unwrap();
                        save_model_as_json(&model, &index_path).unwrap();
                    }
                    println!("Finished indexing");
                });
            }

            server::start(&address, Arc::clone(&model))
        }
        _ => {
            usage(&program);
            eprintln!("ERROR: unknown subcommand {subcommand}");
            Err(())
        }
    }
}
/// Entry point: translate `entry()`'s result into a process exit code.
fn main() -> ExitCode {
    if entry().is_ok() {
        ExitCode::SUCCESS
    } else {
        ExitCode::FAILURE
    }
}
// TODO: search result must consist of clickable links
// TODO: synonym terms
================================================
FILE: src/model.rs
================================================
use std::collections::HashMap;
use std::path::{PathBuf, Path};
use serde::{Deserialize, Serialize};
use super::lexer::Lexer;
use std::time::SystemTime;
// Maps a term to how many indexed documents contain it.
type DocFreq = HashMap<String, usize>;
// Maps a term to how many times it occurs within one document.
type TermFreq = HashMap<String, usize>;

/// Index data for a single document.
#[derive(Deserialize, Serialize)]
pub struct Doc {
    // Term frequencies of this document.
    tf: TermFreq,
    // Total number of tokens in the document (the denominator for TF).
    count: usize,
    // TODO: make sure that the serde serialization of SystemTime also work on other platforms
    last_modified: SystemTime,
}

// All indexed documents keyed by their filesystem path.
type Docs = HashMap<PathBuf, Doc>;

/// The whole search index: per-document data plus global document frequencies.
#[derive(Default, Deserialize, Serialize)]
pub struct Model {
    pub docs: Docs,
    pub df: DocFreq,
}
impl Model {
    /// Remove `file_path` from the index, decrementing the document frequency
    /// of every term the document contributed.
    fn remove_document(&mut self, file_path: &Path) {
        if let Some(doc) = self.docs.remove(file_path) {
            for t in doc.tf.keys() {
                if let Some(f) = self.df.get_mut(t) {
                    *f -= 1;
                }
            }
        }
    }

    /// A document needs reindexing when it is unknown to the model or its
    /// on-disk modification time is newer than the indexed one.
    pub fn requires_reindexing(&mut self, file_path: &Path, last_modified: SystemTime) -> bool {
        match self.docs.get(file_path) {
            Some(doc) => doc.last_modified < last_modified,
            None => true,
        }
    }

    /// Rank all indexed documents against `query` using TF-IDF and return
    /// them sorted by descending rank. Documents whose rank is NaN are skipped.
    pub fn search_query(&self, query: &[char]) -> Vec<(PathBuf, f32)> {
        // was `Lexer::new(&query)`: `query` is already a slice, no extra borrow needed
        let tokens = Lexer::new(query).collect::<Vec<_>>();
        let mut result = Vec::with_capacity(self.docs.len());
        for (path, doc) in &self.docs {
            let rank: f32 = tokens
                .iter()
                .map(|token| compute_tf(token, doc) * compute_idf(token, self.docs.len(), &self.df))
                .sum();
            // TODO: investigate the sources of NaN
            if !rank.is_nan() {
                result.push((path.clone(), rank));
            }
        }
        // Sort ascending then reverse to get descending rank order.
        // `unwrap_or_else` avoids building the panic message on every comparison
        // (the old `.expect(&format!(...))` allocated it unconditionally).
        result.sort_by(|(_, rank1), (_, rank2)| {
            rank1
                .partial_cmp(rank2)
                .unwrap_or_else(|| panic!("{rank1} and {rank2} are not comparable"))
        });
        result.reverse();
        result
    }

    /// (Re)index a document: replaces any previous version of `file_path`,
    /// recomputes its term frequencies and updates the global document frequencies.
    pub fn add_document(&mut self, file_path: PathBuf, last_modified: SystemTime, content: &[char]) {
        self.remove_document(&file_path);

        let mut tf = TermFreq::new();
        let mut count = 0;
        for t in Lexer::new(content) {
            // entry API: a single hash lookup per token instead of get_mut + insert
            *tf.entry(t).or_insert(0) += 1;
            count += 1;
        }

        for t in tf.keys() {
            if let Some(f) = self.df.get_mut(t) {
                *f += 1;
            } else {
                // clone only on a miss; `to_string()` on a `&String` was just a clone anyway
                self.df.insert(t.clone(), 1);
            }
        }

        self.docs.insert(file_path, Doc { count, tf, last_modified });
    }
}
/// Term frequency: the share of `doc`'s tokens that are exactly `t`.
fn compute_tf(t: &str, doc: &Doc) -> f32 {
    let total = doc.count as f32;
    let occurrences = doc.tf.get(t).cloned().unwrap_or(0) as f32;
    occurrences / total
}
/// Inverse document frequency over `n` documents, with the document frequency
/// clamped to at least 1: a term that appears in no document — or whose df
/// counter was decremented to 0 by `remove_document` — previously caused a
/// division by zero and an infinite rank.
fn compute_idf(t: &str, n: usize, df: &DocFreq) -> f32 {
    let n = n as f32;
    let m = df.get(t).cloned().unwrap_or(0).max(1) as f32;
    (n / m).log10()
}
================================================
FILE: src/server.rs
================================================
use std::str;
use std::io;
use std::sync::{Arc, Mutex};
use super::model::*;
use tiny_http::{Server, Request, Response, Header, Method, StatusCode};
/// Respond with a plain-text 404.
fn serve_404(request: Request) -> io::Result<()> {
    let response = Response::from_string("404").with_status_code(StatusCode(404));
    request.respond(response)
}
/// Respond with a plain-text 500.
fn serve_500(request: Request) -> io::Result<()> {
    let response = Response::from_string("500").with_status_code(StatusCode(500));
    request.respond(response)
}
/// Respond with a plain-text 400 carrying `message`.
fn serve_400(request: Request, message: &str) -> io::Result<()> {
    let body = format!("400: {message}");
    request.respond(Response::from_string(body).with_status_code(StatusCode(400)))
}
/// Respond with raw `bytes` under the given `content_type`.
fn serve_bytes(request: Request, bytes: &[u8], content_type: &str) -> io::Result<()> {
    let header = Header::from_bytes("Content-Type", content_type)
        .expect("That we didn't put any garbage in the headers");
    let response = Response::from_data(bytes).with_header(header);
    request.respond(response)
}
// TODO: the errors of serve_api_search should probably return JSON
// 'Cause that's what expected from them.
/// Handle `POST /api/search`: the request body is the plain-text query and the
/// response is a JSON array of up to 20 `[path, rank]` pairs, best match first.
fn serve_api_search(model: Arc<Mutex<Model>>, mut request: Request) -> io::Result<()> {
    let mut buf = Vec::new();
    if let Err(err) = request.as_reader().read_to_end(&mut buf) {
        eprintln!("ERROR: could not read the body of the request: {err}");
        return serve_500(request);
    }

    let body = match str::from_utf8(&buf) {
        Ok(body) => body.chars().collect::<Vec<_>>(),
        Err(err) => {
            eprintln!("ERROR: could not interpret body as UTF-8 string: {err}");
            return serve_400(request, "Body must be a valid UTF-8 string");
        }
    };

    let model = model.lock().unwrap();
    let result = model.search_query(&body);

    // Only the top 20 results are serialized.
    let json = match serde_json::to_string(&result.iter().take(20).collect::<Vec<_>>()) {
        Ok(json) => json,
        Err(err) => {
            eprintln!("ERROR: could not convert search results to JSON: {err}");
            return serve_500(request)
        }
    };

    let content_type_header = Header::from_bytes("Content-Type", "application/json")
        .expect("That we didn't put any garbage in the headers");
    request.respond(Response::from_string(&json).with_header(content_type_header))
}
/// Handle `GET /api/stats`: report the number of indexed documents and
/// distinct terms as a small JSON object.
fn serve_api_stats(model: Arc<Mutex<Model>>, request: Request) -> io::Result<()> {
    use serde::Serialize;

    #[derive(Default, Serialize)]
    struct Stats {
        docs_count: usize,
        terms_count: usize,
    }

    let mut stats: Stats = Default::default();
    {
        // Hold the lock only long enough to read the two counters.
        let model = model.lock().unwrap();
        stats.docs_count = model.docs.len();
        stats.terms_count = model.df.len();
    }

    let json = match serde_json::to_string(&stats) {
        Ok(json) => json,
        Err(err) => {
            eprintln!("ERROR: could not convert stats results to JSON: {err}");
            return serve_500(request)
        }
    };

    let content_type_header = Header::from_bytes("Content-Type", "application/json")
        .expect("That we didn't put any garbage in the headers");
    request.respond(Response::from_string(&json).with_header(content_type_header))
}
/// Route an incoming request to the matching handler; anything else gets a 404.
fn serve_request(model: Arc<Mutex<Model>>, request: Request) -> io::Result<()> {
    println!("INFO: received request! method: {:?}, url: {:?}", request.method(), request.url());

    match (request.method(), request.url()) {
        (Method::Post, "/api/search") => serve_api_search(model, request),
        (Method::Get, "/api/stats") => serve_api_stats(model, request),
        (Method::Get, "/index.js") => {
            // The UI assets are baked into the binary at compile time.
            serve_bytes(request, include_bytes!("index.js"), "text/javascript; charset=utf-8")
        }
        (Method::Get, "/") | (Method::Get, "/index.html") => {
            serve_bytes(request, include_bytes!("index.html"), "text/html; charset=utf-8")
        }
        _ => serve_404(request),
    }
}
/// Run the HTTP server forever on `address`, sharing `model` with the
/// indexing thread. Returns only on failure: either the server cannot start
/// or its socket shuts down. Per-request errors are logged and ignored.
pub fn start(address: &str, model: Arc<Mutex<Model>>) -> Result<(), ()> {
    let server = Server::http(&address).map_err(|err| {
        eprintln!("ERROR: could not start HTTP server at {address}: {err}");
    })?;

    println!("INFO: listening at http://{address}/");

    for request in server.incoming_requests() {
        serve_request(Arc::clone(&model), request).map_err(|err| {
            eprintln!("ERROR: could not serve the response: {err}");
        }).ok(); // <- don't stop on errors, keep serving
    }

    eprintln!("ERROR: the server socket has shutdown");
    Err(())
}
================================================
FILE: src/snowball/algorithms/english_stemmer.rs
================================================
//! Generated by Snowball 2.2.0 - https://snowballstem.org/
#![allow(non_snake_case)]
#![allow(non_upper_case_globals)]
#![allow(unused_mut)]
#![allow(unused_parens)]
#![allow(unused_variables)]
use crate::snowball::SnowballEnv;
use crate::snowball::Among;
// Suffix/prefix tables for the generated Snowball English (Porter2) stemmer.
// Each `Among(text, link, result, method)` entry pairs a string with what
// appears to be the index of a related shorter entry (see among.rs — TODO
// confirm) and the action code the r_Step_* routines dispatch on (-1 = none).

// Special prefixes recognised by r_mark_regions when computing R1.
static A_0: &'static [Among<Context>; 3] = &[
    Among("arsen", -1, -1, None),
    Among("commun", -1, -1, None),
    Among("gener", -1, -1, None),
];

// Possessive/apostrophe endings removed in Step 1a.
static A_1: &'static [Among<Context>; 3] = &[
    Among("'", -1, 1, None),
    Among("'s'", 0, 1, None),
    Among("'s", -1, 1, None),
];

// Plural endings normalised in Step 1a.
static A_2: &'static [Among<Context>; 6] = &[
    Among("ied", -1, 2, None),
    Among("s", -1, 3, None),
    Among("ies", 1, 2, None),
    Among("sses", 1, 1, None),
    Among("ss", 1, -1, None),
    Among("us", 1, -1, None),
];

// Post-"ed"/"ing" cleanup in Step 1b: code 1 appends "e", code 2 undoubles
// the consonant, code 3 (empty match) may append "e" via r_shortv.
static A_3: &'static [Among<Context>; 13] = &[
    Among("", -1, 3, None),
    Among("bb", 0, 2, None),
    Among("dd", 0, 2, None),
    Among("ff", 0, 2, None),
    Among("gg", 0, 2, None),
    Among("bl", 0, 1, None),
    Among("mm", 0, 2, None),
    Among("nn", 0, 2, None),
    Among("pp", 0, 2, None),
    Among("rr", 0, 2, None),
    Among("at", 0, 1, None),
    Among("tt", 0, 2, None),
    Among("iz", 0, 1, None),
];

// "ed"/"ing" style endings handled in Step 1b.
static A_4: &'static [Among<Context>; 6] = &[
    Among("ed", -1, 2, None),
    Among("eed", 0, 1, None),
    Among("ing", -1, 2, None),
    Among("edly", -1, 2, None),
    Among("eedly", 3, 1, None),
    Among("ingly", -1, 2, None),
];

// Derivational suffixes mapped in Step 2.
static A_5: &'static [Among<Context>; 24] = &[
    Among("anci", -1, 3, None),
    Among("enci", -1, 2, None),
    Among("ogi", -1, 13, None),
    Among("li", -1, 15, None),
    Among("bli", 3, 12, None),
    Among("abli", 4, 4, None),
    Among("alli", 3, 8, None),
    Among("fulli", 3, 9, None),
    Among("lessli", 3, 14, None),
    Among("ousli", 3, 10, None),
    Among("entli", 3, 5, None),
    Among("aliti", -1, 8, None),
    Among("biliti", -1, 12, None),
    Among("iviti", -1, 11, None),
    Among("tional", -1, 1, None),
    Among("ational", 14, 7, None),
    Among("alism", -1, 8, None),
    Among("ation", -1, 7, None),
    Among("ization", 17, 6, None),
    Among("izer", -1, 6, None),
    Among("ator", -1, 7, None),
    Among("iveness", -1, 11, None),
    Among("fulness", -1, 9, None),
    Among("ousness", -1, 10, None),
];

// Suffixes mapped or deleted in Step 3.
static A_6: &'static [Among<Context>; 9] = &[
    Among("icate", -1, 4, None),
    Among("ative", -1, 6, None),
    Among("alize", -1, 3, None),
    Among("iciti", -1, 4, None),
    Among("ical", -1, 4, None),
    Among("tional", -1, 1, None),
    Among("ational", 5, 2, None),
    Among("ful", -1, 5, None),
    Among("ness", -1, 5, None),
];

// Residual suffixes deleted (inside R2) in Step 4.
static A_7: &'static [Among<Context>; 18] = &[
    Among("ic", -1, 1, None),
    Among("ance", -1, 1, None),
    Among("ence", -1, 1, None),
    Among("able", -1, 1, None),
    Among("ible", -1, 1, None),
    Among("ate", -1, 1, None),
    Among("ive", -1, 1, None),
    Among("ize", -1, 1, None),
    Among("iti", -1, 1, None),
    Among("al", -1, 1, None),
    Among("ism", -1, 1, None),
    Among("ion", -1, 2, None),
    Among("er", -1, 1, None),
    Among("ous", -1, 1, None),
    Among("ant", -1, 1, None),
    Among("ent", -1, 1, None),
    Among("ment", 15, 1, None),
    Among("ement", 16, 1, None),
];

// Final "e"/"l" handling in Step 5.
static A_8: &'static [Among<Context>; 2] = &[
    Among("e", -1, 1, None),
    Among("l", -1, 2, None),
];

// Whole words matched by r_exception2.
static A_9: &'static [Among<Context>; 8] = &[
    Among("succeed", -1, -1, None),
    Among("proceed", -1, -1, None),
    Among("exceed", -1, -1, None),
    Among("canning", -1, -1, None),
    Among("inning", -1, -1, None),
    Among("earring", -1, -1, None),
    Among("herring", -1, -1, None),
    Among("outing", -1, -1, None),
];

// Irregular whole words rewritten up-front by r_exception1
// (e.g. "skis" -> "ski", "dying" -> "die").
static A_10: &'static [Among<Context>; 18] = &[
    Among("andes", -1, -1, None),
    Among("atlas", -1, -1, None),
    Among("bias", -1, -1, None),
    Among("cosmos", -1, -1, None),
    Among("dying", -1, 3, None),
    Among("early", -1, 9, None),
    Among("gently", -1, 7, None),
    Among("howe", -1, -1, None),
    Among("idly", -1, 6, None),
    Among("lying", -1, 4, None),
    Among("news", -1, -1, None),
    Among("only", -1, 10, None),
    Among("singly", -1, 11, None),
    Among("skies", -1, 2, None),
    Among("skis", -1, 1, None),
    Among("sky", -1, -1, None),
    Among("tying", -1, 5, None),
    Among("ugly", -1, 8, None),
];

// Character-group bitmaps in Snowball's encoding: vowels, vowels plus w/x/Y,
// and the consonants that permit "li"-deletion in Step 2.
static G_v: &'static [u8; 4] = &[17, 65, 16, 1];

static G_v_WXY: &'static [u8; 5] = &[1, 17, 65, 208, 1];

static G_valid_LI: &'static [u8; 3] = &[55, 141, 2];

/// Per-word mutable state threaded through the stemming routines.
#[derive(Clone)]
struct Context {
    b_Y_found: bool, // set by r_prelude when some "y" was rewritten to "Y"
    i_p2: i32,       // cursor offset where region R2 begins
    i_p1: i32,       // cursor offset where region R1 begins
}
/// Prelude pass: deletes a leading apostrophe, and rewrites "y" to the
/// consonant marker "Y" when word-initial or directly after a vowel,
/// recording in `context.b_Y_found` that a rewrite happened.
fn r_prelude(env: &mut SnowballEnv, context: &mut Context) -> bool {
    context.b_Y_found = false;
    // Try: strip a leading "'".
    let v_1 = env.cursor;
    'lab0: loop {
        env.bra = env.cursor;
        if !env.eq_s(&"'") {
            break 'lab0;
        }
        env.ket = env.cursor;
        if !env.slice_del() {
            return false;
        }
        break 'lab0;
    }
    env.cursor = v_1;
    // Try: rewrite a word-initial "y".
    let v_2 = env.cursor;
    'lab1: loop {
        env.bra = env.cursor;
        if !env.eq_s(&"y") {
            break 'lab1;
        }
        env.ket = env.cursor;
        if !env.slice_from("Y") {
            return false;
        }
        context.b_Y_found = true;
        break 'lab1;
    }
    env.cursor = v_2;
    // Repeatedly: scan forward for a vowel followed by "y" and rewrite the "y".
    let v_3 = env.cursor;
    'lab2: loop {
        'replab3: loop{
            let v_4 = env.cursor;
            'lab4: for _ in 0..1 {
                'golab5: loop {
                    let v_5 = env.cursor;
                    'lab6: loop {
                        if !env.in_grouping(G_v, 97, 121) {
                            break 'lab6;
                        }
                        env.bra = env.cursor;
                        if !env.eq_s(&"y") {
                            break 'lab6;
                        }
                        env.ket = env.cursor;
                        env.cursor = v_5;
                        break 'golab5;
                    }
                    env.cursor = v_5;
                    if env.cursor >= env.limit {
                        break 'lab4;
                    }
                    env.next_char();
                }
                if !env.slice_from("Y") {
                    return false;
                }
                context.b_Y_found = true;
                continue 'replab3;
            }
            env.cursor = v_4;
            break 'replab3;
        }
        break 'lab2;
    }
    env.cursor = v_3;
    return true;
}
/// Compute the R1/R2 region boundaries used by the later steps: `i_p1` is set
/// after the first vowel/non-vowel pair (or directly after one of the special
/// prefixes in A_0), `i_p2` after the next such pair. Both default to the end
/// of the word when the pattern is absent.
fn r_mark_regions(env: &mut SnowballEnv, context: &mut Context) -> bool {
    context.i_p1 = env.limit;
    context.i_p2 = env.limit;
    let v_1 = env.cursor;
    'lab0: loop {
        'lab1: loop {
            let v_2 = env.cursor;
            // Special prefixes ("arsen", "commun", "gener") begin R1 right after them.
            'lab2: loop {
                if env.find_among(A_0, context) == 0 {
                    break 'lab2;
                }
                break 'lab1;
            }
            env.cursor = v_2;
            // Otherwise advance to the first vowel...
            'golab3: loop {
                'lab4: loop {
                    if !env.in_grouping(G_v, 97, 121) {
                        break 'lab4;
                    }
                    break 'golab3;
                }
                if env.cursor >= env.limit {
                    break 'lab0;
                }
                env.next_char();
            }
            // ...then to the first non-vowel after it.
            'golab5: loop {
                'lab6: loop {
                    if !env.out_grouping(G_v, 97, 121) {
                        break 'lab6;
                    }
                    break 'golab5;
                }
                if env.cursor >= env.limit {
                    break 'lab0;
                }
                env.next_char();
            }
            break 'lab1;
        }
        context.i_p1 = env.cursor;
        // Repeat the vowel / non-vowel scan once more to find R2.
        'golab7: loop {
            'lab8: loop {
                if !env.in_grouping(G_v, 97, 121) {
                    break 'lab8;
                }
                break 'golab7;
            }
            if env.cursor >= env.limit {
                break 'lab0;
            }
            env.next_char();
        }
        'golab9: loop {
            'lab10: loop {
                if !env.out_grouping(G_v, 97, 121) {
                    break 'lab10;
                }
                break 'golab9;
            }
            if env.cursor >= env.limit {
                break 'lab0;
            }
            env.next_char();
        }
        context.i_p2 = env.cursor;
        break 'lab0;
    }
    env.cursor = v_1;
    return true;
}
/// Backwards test for a "short syllable" ending: reading forward, either a
/// non-vowel + vowel + non-vowel-excluding-w/x/Y, or — only at the very start
/// of the word — a vowel followed by a non-vowel.
fn r_shortv(env: &mut SnowballEnv, context: &mut Context) -> bool {
    'lab0: loop {
        let v_1 = env.limit - env.cursor;
        'lab1: loop {
            if !env.out_grouping_b(G_v_WXY, 89, 121) {
                break 'lab1;
            }
            if !env.in_grouping_b(G_v, 97, 121) {
                break 'lab1;
            }
            if !env.out_grouping_b(G_v, 97, 121) {
                break 'lab1;
            }
            break 'lab0;
        }
        env.cursor = env.limit - v_1;
        if !env.out_grouping_b(G_v, 97, 121) {
            return false;
        }
        if !env.in_grouping_b(G_v, 97, 121) {
            return false;
        }
        // This alternative only matches at the very beginning of the word.
        if env.cursor > env.limit_backward {
            return false;
        }
        break 'lab0;
    }
    return true;
}
/// True when the cursor lies at or past the R1 region boundary.
fn r_R1(env: &mut SnowballEnv, context: &mut Context) -> bool {
    context.i_p1 <= env.cursor
}
/// True when the cursor lies at or past the R2 region boundary.
fn r_R2(env: &mut SnowballEnv, context: &mut Context) -> bool {
    context.i_p2 <= env.cursor
}
/// Step 1a: drop possessive endings (A_1: "'", "'s", "'s'"), then normalise
/// plural endings (A_2): "sses" -> "ss", "ied"/"ies" -> "i" or "ie", and a
/// plain "s" is removed only when a vowel occurs earlier in the word
/// ("ss"/"us" are left untouched).
fn r_Step_1a(env: &mut SnowballEnv, context: &mut Context) -> bool {
    let mut among_var;
    // Optional: remove an apostrophe suffix first.
    let v_1 = env.limit - env.cursor;
    'lab0: loop {
        env.ket = env.cursor;
        if env.find_among_b(A_1, context) == 0 {
            env.cursor = env.limit - v_1;
            break 'lab0;
        }
        env.bra = env.cursor;
        if !env.slice_del() {
            return false;
        }
        break 'lab0;
    }
    env.ket = env.cursor;
    among_var = env.find_among_b(A_2, context);
    if among_var == 0 {
        return false;
    }
    env.bra = env.cursor;
    if among_var == 1 {
        if !env.slice_from("ss") {
            return false;
        }
    } else if among_var == 2 {
        // "ied"/"ies": becomes "i" on longer words, "ie" on very short ones.
        'lab1: loop {
            let v_2 = env.limit - env.cursor;
            'lab2: loop {
                if !env.hop_back(2) {
                    break 'lab2;
                }
                if !env.slice_from("i") {
                    return false;
                }
                break 'lab1;
            }
            env.cursor = env.limit - v_2;
            if !env.slice_from("ie") {
                return false;
            }
            break 'lab1;
        }
    } else if among_var == 3 {
        // Plain "s": delete only if some vowel precedes the final position.
        if env.cursor <= env.limit_backward {
            return false;
        }
        env.previous_char();
        'golab3: loop {
            'lab4: loop {
                if !env.in_grouping_b(G_v, 97, 121) {
                    break 'lab4;
                }
                break 'golab3;
            }
            if env.cursor <= env.limit_backward {
                return false;
            }
            env.previous_char();
        }
        if !env.slice_del() {
            return false;
        }
    }
    return true;
}
/// Step 1b: handle "ed"/"ing"-style endings (A_4). "eed(ly)" inside R1 maps
/// to "ee"; otherwise the ending is removed when a vowel precedes it, after
/// which doubled consonants are undone and, for short stems, a trailing "e"
/// is restored (table A_3, helper r_shortv).
fn r_Step_1b(env: &mut SnowballEnv, context: &mut Context) -> bool {
    let mut among_var;
    env.ket = env.cursor;
    among_var = env.find_among_b(A_4, context);
    if among_var == 0 {
        return false;
    }
    env.bra = env.cursor;
    if among_var == 1 {
        if !r_R1(env, context) {
            return false;
        }
        if !env.slice_from("ee") {
            return false;
        }
    } else if among_var == 2 {
        // Require a vowel somewhere before the ending.
        let v_1 = env.limit - env.cursor;
        'golab0: loop {
            'lab1: loop {
                if !env.in_grouping_b(G_v, 97, 121) {
                    break 'lab1;
                }
                break 'golab0;
            }
            if env.cursor <= env.limit_backward {
                return false;
            }
            env.previous_char();
        }
        env.cursor = env.limit - v_1;
        if !env.slice_del() {
            return false;
        }
        // Post-process what remains (table A_3).
        let v_3 = env.limit - env.cursor;
        among_var = env.find_among_b(A_3, context);
        if among_var == 0 {
            return false;
        }
        env.cursor = env.limit - v_3;
        if among_var == 1 {
            // "at"/"bl"/"iz": append an "e".
            let c = env.cursor;
            let (bra, ket) = (env.cursor, env.cursor);
            env.insert(bra, ket, "e");
            env.cursor = c;
        } else if among_var == 2 {
            // Doubled consonant: drop one letter.
            env.ket = env.cursor;
            if env.cursor <= env.limit_backward {
                return false;
            }
            env.previous_char();
            env.bra = env.cursor;
            if !env.slice_del() {
                return false;
            }
        } else if among_var == 3 {
            // Stem ends exactly at R1 and in a short syllable: append an "e".
            if env.cursor != context.i_p1 {
                return false;
            }
            let v_4 = env.limit - env.cursor;
            if !r_shortv(env, context) {
                return false;
            }
            env.cursor = env.limit - v_4;
            let c = env.cursor;
            let (bra, ket) = (env.cursor, env.cursor);
            env.insert(bra, ket, "e");
            env.cursor = c;
        }
    }
    return true;
}
/// Step 1c: replace a final "y" or "Y" by "i" when it is preceded by a
/// non-vowel that is not at the very start of the word.
fn r_Step_1c(env: &mut SnowballEnv, context: &mut Context) -> bool {
    env.ket = env.cursor;
    // Accept either the plain "y" or the marked "Y" from the prelude.
    'lab0: loop {
        let v_1 = env.limit - env.cursor;
        'lab1: loop {
            if !env.eq_s_b(&"y") {
                break 'lab1;
            }
            break 'lab0;
        }
        env.cursor = env.limit - v_1;
        if !env.eq_s_b(&"Y") {
            return false;
        }
        break 'lab0;
    }
    env.bra = env.cursor;
    if !env.out_grouping_b(G_v, 97, 121) {
        return false;
    }
    // Fail when the preceding non-vowel is the first character of the word.
    'lab2: loop {
        if env.cursor > env.limit_backward {
            break 'lab2;
        }
        return false;
    }
    if !env.slice_from("i") {
        return false;
    }
    return true;
}
/// Step 2: map the derivational suffixes of A_5 when they fall inside R1,
/// e.g. "ization" -> "ize", "ational" -> "ate", "fulness" -> "ful";
/// "ogi" and "li" have extra context conditions.
fn r_Step_2(env: &mut SnowballEnv, context: &mut Context) -> bool {
    let mut among_var;
    env.ket = env.cursor;
    among_var = env.find_among_b(A_5, context);
    if among_var == 0 {
        return false;
    }
    env.bra = env.cursor;
    if !r_R1(env, context) {
        return false;
    }
    if among_var == 1 {
        if !env.slice_from("tion") {
            return false;
        }
    } else if among_var == 2 {
        if !env.slice_from("ence") {
            return false;
        }
    } else if among_var == 3 {
        if !env.slice_from("ance") {
            return false;
        }
    } else if among_var == 4 {
        if !env.slice_from("able") {
            return false;
        }
    } else if among_var == 5 {
        if !env.slice_from("ent") {
            return false;
        }
    } else if among_var == 6 {
        if !env.slice_from("ize") {
            return false;
        }
    } else if among_var == 7 {
        if !env.slice_from("ate") {
            return false;
        }
    } else if among_var == 8 {
        if !env.slice_from("al") {
            return false;
        }
    } else if among_var == 9 {
        if !env.slice_from("ful") {
            return false;
        }
    } else if among_var == 10 {
        if !env.slice_from("ous") {
            return false;
        }
    } else if among_var == 11 {
        if !env.slice_from("ive") {
            return false;
        }
    } else if among_var == 12 {
        if !env.slice_from("ble") {
            return false;
        }
    } else if among_var == 13 {
        // "ogi" -> "og", but only when preceded by an "l".
        if !env.eq_s_b(&"l") {
            return false;
        }
        if !env.slice_from("og") {
            return false;
        }
    } else if among_var == 14 {
        if !env.slice_from("less") {
            return false;
        }
    } else if among_var == 15 {
        // "li" is deleted only after a valid li-ending consonant.
        if !env.in_grouping_b(G_valid_LI, 99, 116) {
            return false;
        }
        if !env.slice_del() {
            return false;
        }
    }
    return true;
}
/// Step 3: further suffix mapping inside R1 (A_6), e.g. "ational" -> "ate",
/// "tional" -> "tion", "ical" -> "ic"; "ative" is deleted but only inside R2.
fn r_Step_3(env: &mut SnowballEnv, context: &mut Context) -> bool {
    let mut among_var;
    env.ket = env.cursor;
    among_var = env.find_among_b(A_6, context);
    if among_var == 0 {
        return false;
    }
    env.bra = env.cursor;
    if !r_R1(env, context) {
        return false;
    }
    if among_var == 1 {
        if !env.slice_from("tion") {
            return false;
        }
    } else if among_var == 2 {
        if !env.slice_from("ate") {
            return false;
        }
    } else if among_var == 3 {
        if !env.slice_from("al") {
            return false;
        }
    } else if among_var == 4 {
        if !env.slice_from("ic") {
            return false;
        }
    } else if among_var == 5 {
        if !env.slice_del() {
            return false;
        }
    } else if among_var == 6 {
        // "ative": deletion additionally requires R2.
        if !r_R2(env, context) {
            return false;
        }
        if !env.slice_del() {
            return false;
        }
    }
    return true;
}
/// Step 4: delete the residual suffixes of A_7 when they lie inside R2;
/// "ion" is only deleted when preceded by "s" or "t".
fn r_Step_4(env: &mut SnowballEnv, context: &mut Context) -> bool {
    let mut among_var;
    env.ket = env.cursor;
    among_var = env.find_among_b(A_7, context);
    if among_var == 0 {
        return false;
    }
    env.bra = env.cursor;
    if !r_R2(env, context) {
        return false;
    }
    if among_var == 1 {
        if !env.slice_del() {
            return false;
        }
    } else if among_var == 2 {
        // "ion": require a preceding "s" or "t".
        'lab0: loop {
            let v_1 = env.limit - env.cursor;
            'lab1: loop {
                if !env.eq_s_b(&"s") {
                    break 'lab1;
                }
                break 'lab0;
            }
            env.cursor = env.limit - v_1;
            if !env.eq_s_b(&"t") {
                return false;
            }
            break 'lab0;
        }
        if !env.slice_del() {
            return false;
        }
    }
    return true;
}
/// Step 5 of the stemmer: drop a final suffix matched backwards via table
/// `A_8` under region conditions. In the standard English (Porter2)
/// algorithm these are the trailing "e" and "l" rules — presumed here from
/// the checks below; confirm against the `A_8` table definition.
fn r_Step_5(env: &mut SnowballEnv, context: &mut Context) -> bool {
    let mut among_var;
    env.ket = env.cursor;
    among_var = env.find_among_b(A_8, context);
    if among_var == 0 {
        return false;
    }
    env.bra = env.cursor;
    if among_var == 1 {
        // Delete the case-1 suffix when it is in R2, OR when it is in R1
        // and NOT preceded by a short syllable (r_shortv).
        'lab0: loop {
            let v_1 = env.limit - env.cursor;
            // First alternative: the match is inside R2.
            'lab1: loop {
                if !r_R2(env, context) {
                    break 'lab1;
                }
                break 'lab0;
            }
            env.cursor = env.limit - v_1;
            // Second alternative: inside R1 and not after a short syllable.
            if !r_R1(env, context) {
                return false;
            }
            let v_2 = env.limit - env.cursor;
            'lab2: loop {
                if !r_shortv(env, context) {
                    break 'lab2;
                }
                // A short syllable precedes: the rule does not apply.
                return false;
            }
            env.cursor = env.limit - v_2;
            break 'lab0;
        }
        if !env.slice_del() {
            return false;
        }
    } else if among_var == 2 {
        // Delete the case-2 suffix only inside R2 and only when it is
        // preceded by a literal "l" (i.e. the word ends in a double "l").
        if !r_R2(env, context) {
            return false;
        }
        if !env.eq_s_b(&"l") {
            return false;
        }
        if !env.slice_del() {
            return false;
        }
    }
    return true;
}
/// Whole-word exception check (backward mode): succeeds only when the
/// remaining word is one of the `A_9` forms AND the match extends all the
/// way back to `limit_backward`, i.e. covers the entire word. Matching
/// words are left untouched by the later suffix steps.
fn r_exception2(env: &mut SnowballEnv, context: &mut Context) -> bool {
    env.ket = env.cursor;
    let matched = env.find_among_b(A_9, context);
    if matched == 0 {
        return false;
    }
    env.bra = env.cursor;
    // The match must consume the whole word, not just a suffix of it.
    env.cursor <= env.limit_backward
}
/// Whole-word exception check (forward mode): words matched in full by
/// table `A_10` are replaced with a fixed irregular stem instead of going
/// through the normal stemming steps.
/// Returns `false` when no `A_10` entry matches the entire word.
fn r_exception1(env: &mut SnowballEnv, context: &mut Context) -> bool {
    env.bra = env.cursor;
    let among_var = env.find_among(A_10, context);
    if among_var == 0 {
        return false;
    }
    env.ket = env.cursor;
    // The exception must consume the entire word.
    if env.cursor < env.limit {
        return false;
    }
    // Map the matched entry to its hard-coded replacement stem.
    let stem = match among_var {
        1 => "ski",
        2 => "sky",
        3 => "die",
        4 => "lie",
        5 => "tie",
        6 => "idl",
        7 => "gentl",
        8 => "ugli",
        9 => "earli",
        10 => "onli",
        11 => "singl",
        _ => return true,
    };
    env.slice_from(stem)
}
/// Undo the prelude's consonant marking: replace every "Y" in the word
/// with "y". Bails out immediately (returns `false`) unless `b_Y_found`
/// was set — presumably by `r_prelude` when it upper-cased a "y";
/// confirm against the prelude's definition.
fn r_postlude(env: &mut SnowballEnv, context: &mut Context) -> bool {
    if !context.b_Y_found {
        return false;
    }
    // Generated "repeat { goto 'Y'; replace } " machinery:
    // each pass scans forward for the next "Y", replaces it with "y",
    // and restarts; the loop ends when the scan reaches `limit`.
    'replab0: loop {
        let v_1 = env.cursor;
        'lab1: for _ in 0..1 {
            // Scan forward character by character for the next "Y".
            'golab2: loop {
                let v_2 = env.cursor;
                'lab3: loop {
                    env.bra = env.cursor;
                    if !env.eq_s(&"Y") {
                        break 'lab3;
                    }
                    env.ket = env.cursor;
                    // Found: rewind so bra/ket bracket the "Y" just matched.
                    env.cursor = v_2;
                    break 'golab2;
                }
                env.cursor = v_2;
                if env.cursor >= env.limit {
                    // End of word, no further "Y": leave the repeat loop.
                    break 'lab1;
                }
                env.next_char();
            }
            if !env.slice_from("y") {
                return false;
            }
            continue 'replab0;
        }
        env.cursor = v_1;
        break 'replab0;
    }
    return true;
}
/// Run the English Snowball stemmer over the word held in `env`,
/// mutating `env.current` in place. Always returns `true`.
///
/// High-level flow: whole-word exceptions first, then words shorter than
/// three characters are returned unchanged; otherwise the prelude and
/// region marking run forwards, the suffix-stripping steps run backwards,
/// and the postlude restores any "Y" markers.
pub fn stem(env: &mut SnowballEnv) -> bool {
    // Note: the binding itself is never reassigned, so it needs no `mut`
    // (the Context behind the `&mut` is still freely mutable).
    let context = &mut Context {
        b_Y_found: false,
        i_p2: 0,
        i_p1: 0,
    };
    'lab0: loop {
        let v_1 = env.cursor;
        // Irregular whole-word exceptions bypass all later processing.
        'lab1: loop {
            if !r_exception1(env, context) {
                break 'lab1;
            }
            break 'lab0;
        }
        env.cursor = v_1;
        // Words with fewer than 3 characters are left unchanged:
        // hop(3) fails, which breaks straight out of 'lab0.
        'lab2: loop {
            let v_2 = env.cursor;
            'lab3: loop {
                if !env.hop(3) {
                    break 'lab3;
                }
                break 'lab2;
            }
            env.cursor = v_2;
            break 'lab0;
        }
        env.cursor = v_1;
        r_prelude(env, context);
        r_mark_regions(env, context);
        // Switch to backward mode: the Step rules strip suffixes from the end.
        env.limit_backward = env.cursor;
        env.cursor = env.limit;
        let v_5 = env.limit - env.cursor;
        r_Step_1a(env, context);
        env.cursor = env.limit - v_5;
        // exception2 words skip Steps 1b-5 entirely.
        'lab4: loop {
            let v_6 = env.limit - env.cursor;
            'lab5: loop {
                if !r_exception2(env, context) {
                    break 'lab5;
                }
                break 'lab4;
            }
            env.cursor = env.limit - v_6;
            // Each step restores the cursor afterwards so the next step
            // starts from the word end again.
            let v_7 = env.limit - env.cursor;
            r_Step_1b(env, context);
            env.cursor = env.limit - v_7;
            let v_8 = env.limit - env.cursor;
            r_Step_1c(env, context);
            env.cursor = env.limit - v_8;
            let v_9 = env.limit - env.cursor;
            r_Step_2(env, context);
            env.cursor = env.limit - v_9;
            let v_10 = env.limit - env.cursor;
            r_Step_3(env, context);
            env.cursor = env.limit - v_10;
            let v_11 = env.limit - env.cursor;
            r_Step_4(env, context);
            env.cursor = env.limit - v_11;
            let v_12 = env.limit - env.cursor;
            r_Step_5(env, context);
            env.cursor = env.limit - v_12;
            break 'lab4;
        }
        // Back to forward mode for the postlude.
        env.cursor = env.limit_backward;
        let v_13 = env.cursor;
        r_postlude(env, context);
        env.cursor = v_13;
        break 'lab0;
    }
    return true;
}
================================================
FILE: src/snowball/algorithms/mod.rs
================================================
// NOTE(review): the original comment said "Have a look at build.rs", but this
// repository ships no build script — the generated per-language include below
// is disabled, and the single bundled algorithm is declared directly instead.
//include!(concat!(env!("OUT_DIR"), "/lang_include.rs"));
pub mod english_stemmer;
================================================
FILE: src/snowball/among.rs
================================================
use crate::snowball::SnowballEnv;
/// One entry of a Snowball "among" table, searched by
/// `SnowballEnv::find_among` / `find_among_b`.
///
/// Positional fields, mirroring the upstream Snowball code generator:
/// - `.0`: the literal string this entry matches;
/// - `.1`: index of the next entry to try on failure (negative terminates
///   the scan and reports "no match");
/// - `.2`: the result value returned to the caller when this entry matches;
/// - `.3`: an optional extra predicate that must also succeed for the
///   entry to count as a match.
pub struct Among<T: 'static>(pub &'static str,
pub i32,
pub i32,
pub Option<&'static (dyn Fn(&mut SnowballEnv, &mut T) -> bool + Sync)>);
================================================
FILE: src/snowball/mod.rs
================================================
// TODO: add Snowball license in here
pub mod algorithms;
mod among;
mod snowball_env;
// The `crate::` prefix is required here: in the 2018+ editions a `use` path
// must begin with `crate::`, `self::`, `super::`, or an external crate name —
// a bare `use among::Among;` would be looked up as an external crate.
pub use crate::snowball::among::Among;
pub use crate::snowball::snowball_env::SnowballEnv;
================================================
FILE: src/snowball/snowball_env.rs
================================================
use std::borrow::Cow;
use crate::snowball::Among;
#[derive(Debug, Clone)]
pub struct SnowballEnv<'a> {
    // The word being stemmed; copy-on-write, so the input is only
    // duplicated once a mutating operation (replace_s) runs.
    pub current: Cow<'a, str>,
    // Current position as a byte offset into `current` (stored as i32,
    // matching the generated Snowball code's arithmetic).
    pub cursor: i32,
    // Forward scanning limit (byte offset; initially `current.len()`).
    pub limit: i32,
    // Backward scanning limit for the `_b` routines (initially 0).
    pub limit_backward: i32,
    // Start of the currently bracketed slice (byte offset), consumed by
    // `slice_from` / `slice_del`.
    pub bra: i32,
    // End of the currently bracketed slice (byte offset).
    pub ket: i32,
}
impl<'a> SnowballEnv<'a> {
    /// Build an environment over `value`: cursor at the start,
    /// `limit`/`ket` at the end (byte offsets), `limit_backward`/`bra` at 0.
    pub fn create(value: &'a str) -> Self {
        let len = value.len();
        SnowballEnv {
            current: Cow::from(value),
            cursor: 0,
            limit: len as i32,
            limit_backward: 0,
            bra: 0,
            ket: len as i32,
        }
    }

    /// Consume the environment and return the (possibly modified) string.
    pub fn get_current(self) -> Cow<'a, str> {
        self.current
    }

    /// Replace the current string with a borrowed value.
    pub fn set_current(&mut self, current: &'a str) {
        self.current = Cow::from(current);
    }

    /// Replace the current string with an owned value.
    pub fn set_current_s(&mut self, current: String) {
        self.current = Cow::from(current);
    }

    /// Core splice: replace the byte range `bra..ket` of `current` with `s`.
    /// Returns the signed change in length (in bytes) and shifts `limit`
    /// and `cursor` to stay consistent with the new string.
    fn replace_s(&mut self, bra: i32, ket: i32, s: &str) -> i32 {
        let adjustment = s.len() as i32 - (ket - bra);
        let mut result = String::with_capacity(self.current.len());
        {
            // Rebuild the string as prefix + replacement + suffix.
            let (lhs, _) = self.current.split_at(bra as usize);
            let (_, rhs) = self.current.split_at(ket as usize);
            result.push_str(lhs);
            result.push_str(s);
            result.push_str(rhs);
        }
        // ... not very nice...
        let new_lim = self.limit + adjustment;
        self.limit = new_lim;
        // Cursor bookkeeping: shift it if it sat past the replaced region,
        // clamp it to `bra` if it sat inside the region.
        if self.cursor >= ket {
            let new_cur = self.cursor + adjustment;
            self.cursor = new_cur;
        } else if self.cursor > bra {
            self.cursor = bra
        }
        self.current = Cow::from(result);
        adjustment
    }

    /// Check if s is after cursor.
    /// If so, move cursor to the end of s
    pub fn eq_s(&mut self, s: &str) -> bool {
        if self.cursor >= self.limit {
            return false;
        }
        if self.current[(self.cursor as usize)..].starts_with(s) {
            self.cursor += s.len() as i32;
            // Defensively step forward to the next char boundary
            // (a match on a full &str should already end on one).
            while !self.current.is_char_boundary(self.cursor as usize) {
                self.cursor += 1;
            }
            true
        } else {
            false
        }
    }

    /// Check if 's' is before cursor
    /// If so, move cursor to the beginning of s
    pub fn eq_s_b(&mut self, s: &str) -> bool {
        // The guard also ensures `cursor - s.len()` cannot underflow below.
        if (self.cursor - self.limit_backward) < s.len() as i32 {
            false
        // Check if cursor -s.len is a char boundary. if not well... return false obv
        } else if !self.current.is_char_boundary(self.cursor as usize - s.len()) ||
                  !self.current[self.cursor as usize - s.len()..].starts_with(s) {
            false
        } else {
            self.cursor -= s.len() as i32;
            true
        }
    }

    /// Replace string between `bra` and `ket` with s
    pub fn slice_from(&mut self, s: &str) -> bool {
        let (bra, ket) = (self.bra, self.ket);
        self.replace_s(bra, ket, s);
        true
    }

    /// Move cursor to next character
    // (advances one byte, then skips continuation bytes to the next
    // UTF-8 char boundary)
    pub fn next_char(&mut self) {
        self.cursor += 1;
        while !self.current.is_char_boundary(self.cursor as usize) {
            self.cursor += 1;
        }
    }

    /// Move cursor to previous character
    pub fn previous_char(&mut self) {
        self.cursor -= 1;
        while !self.current.is_char_boundary(self.cursor as usize) {
            self.cursor -= 1;
        }
    }

    /// Advance the cursor by `delta` characters; fails (without moving)
    /// when fewer than `delta` characters remain before `limit`.
    pub fn hop(&mut self, mut delta: i32) -> bool {
        let mut res = self.cursor;
        while delta > 0 {
            delta -= 1;
            if res >= self.limit {
                return false;
            }
            res += 1;
            while res < self.limit && !self.current.is_char_boundary(res as usize) {
                res += 1;
            }
        }
        self.cursor = res;
        return true;
    }

    /// `hop`, but a negative `delta` fails instead of being undefined.
    pub fn hop_checked(&mut self, delta: i32) -> bool {
        return delta >= 0 && self.hop(delta);
    }

    /// Move the cursor back by `delta` characters; fails (without moving)
    /// when fewer than `delta` characters remain before `limit_backward`.
    pub fn hop_back(&mut self, mut delta: i32) -> bool {
        let mut res = self.cursor;
        while delta > 0 {
            delta -= 1;
            if res <= self.limit_backward {
                return false;
            }
            res -= 1;
            while res > self.limit_backward && !self.current.is_char_boundary(res as usize) {
                res -= 1;
            }
        }
        self.cursor = res;
        return true;
    }

    /// `hop_back`, but a negative `delta` fails instead of being undefined.
    pub fn hop_back_checked(&mut self, delta: i32) -> bool {
        return delta >= 0 && self.hop_back(delta);
    }

    // A grouping is represented by a minimum code point, a maximum code point,
    // and a bitfield of which code points in that range are in the grouping.
    // For example, in english.sbl, valid_LI is 'cdeghkmnrt'.
    // The minimum and maximum code points are 99 and 116,
    // so every time one of these grouping functions is called for g_valid_LI,
    // min must be 99 and max must be 116. There are 18 code points within that
    // range (inclusive) so the grouping is represented with 18 bits, plus 6 bits of padding:
    //
    //   cdefghij klmnopqr st
    //   11101100 10110001 01000000
    //
    // The first bit is the least significant.
    // Those three bytes become &[0b00110111, 0b10001101, 0b00000010],
    // which is &[55, 141, 2], which is how g_valid_LI is defined in english.rs.

    /// Check if the char the cursor points to is in the grouping
    /// (and advance past it when it is).
    pub fn in_grouping(&mut self, chars: &[u8], min: u32, max: u32) -> bool {
        if self.cursor >= self.limit {
            return false;
        }
        if let Some(chr) = self.current[self.cursor as usize..].chars().next() {
            let mut ch = chr as u32; //codepoint as integer
            if ch > max || ch < min {
                return false;
            }
            ch -= min;
            // Test the bit for this code point in the packed bitfield.
            if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 {
                return false;
            }
            self.next_char();
            return true;
        }
        return false;
    }

    /// Backward-mode counterpart of `in_grouping`: tests the char just
    /// before the cursor and moves back over it on success. The
    /// previous_char/next_char dance restores the cursor on failure.
    pub fn in_grouping_b(&mut self, chars: &[u8], min: u32, max: u32) -> bool {
        if self.cursor <= self.limit_backward {
            return false;
        }
        self.previous_char();
        if let Some(chr) = self.current[self.cursor as usize..].chars().next() {
            let mut ch = chr as u32; //codepoint as integer
            self.next_char();
            if ch > max || ch < min {
                return false;
            }
            ch -= min;
            if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 {
                return false;
            }
            self.previous_char();
            return true;
        }
        return false;
    }

    /// Succeeds (and advances) when the char at the cursor is NOT in the
    /// grouping; note chars outside `min..=max` count as "not in".
    pub fn out_grouping(&mut self, chars: &[u8], min: u32, max: u32) -> bool {
        if self.cursor >= self.limit {
            return false;
        }
        if let Some(chr) = self.current[self.cursor as usize..].chars().next() {
            let mut ch = chr as u32; //codepoint as integer
            if ch > max || ch < min {
                self.next_char();
                return true;
            }
            ch -= min;
            if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 {
                self.next_char();
                return true;
            }
        }
        return false;
    }

    /// Backward-mode counterpart of `out_grouping`.
    pub fn out_grouping_b(&mut self, chars: &[u8], min: u32, max: u32) -> bool {
        if self.cursor <= self.limit_backward {
            return false;
        }
        self.previous_char();
        if let Some(chr) = self.current[self.cursor as usize..].chars().next() {
            let mut ch = chr as u32; //codepoint as integer
            self.next_char();
            if ch > max || ch < min {
                self.previous_char();
                return true;
            }
            ch -= min;
            if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 {
                self.previous_char();
                return true;
            }
        }
        return false;
    }

    /// Helper function that removes the string slice between `bra` and `ket`
    pub fn slice_del(&mut self) -> bool {
        self.slice_from("")
    }

    /// Splice `s` into the byte range `bra..ket`, shifting the stored
    /// `self.bra`/`self.ket` marks by the length change when they sit at
    /// or after the insertion point.
    pub fn insert(&mut self, bra: i32, ket: i32, s: &str) {
        let adjustment = self.replace_s(bra, ket, s);
        if bra <= self.bra {
            self.bra = self.bra + adjustment;
        }
        if bra <= self.ket {
            self.ket = self.ket + adjustment;
        }
    }

    /// Return the string up to `limit` as an owned String.
    pub fn assign_to(&mut self) -> String {
        self.current[0..self.limit as usize].to_string()
    }

    /// Return the currently bracketed slice (`bra..ket`) as an owned String.
    pub fn slice_to(&mut self) -> String {
        self.current[self.bra as usize..self.ket as usize].to_string()
    }

    /// Forward longest-match search over a sorted `Among` table starting at
    /// the cursor. Phase 1 is a binary search tracking the common prefix
    /// length against the lower (`common_i`) and upper (`common_j`) bounds;
    /// phase 2 walks the `w.1` fallback links, running each candidate's
    /// optional predicate (`w.3`). Returns the matched entry's result
    /// value `w.2`, or 0 when nothing matches; on success the cursor is
    /// left after the matched text.
    pub fn find_among<T>(&mut self, amongs: &[Among<T>], context: &mut T) -> i32 {
        use std::cmp::min;
        let mut i: i32 = 0;
        let mut j: i32 = amongs.len() as i32;

        let c = self.cursor;
        let l = self.limit;

        let mut common_i = 0i32;
        let mut common_j = 0i32;

        let mut first_key_inspected = false;

        loop {
            let k = i + ((j - i) >> 1);
            let mut diff: i32 = 0;
            // Characters up to `common` are already known to match.
            let mut common = min(common_i, common_j);
            let w = &amongs[k as usize];
            for lvar in common..w.0.len() as i32 {
                if c + common == l {
                    diff = -1;
                    break;
                }
                diff = self.current.as_bytes()[(c + common) as usize] as i32 - w.0.as_bytes()[lvar as usize] as i32;
                if diff != 0 {
                    break;
                }
                common += 1;
            }
            if diff < 0 {
                j = k;
                common_j = common;
            } else {
                i = k;
                common_i = common;
            }
            if j - i <= 1 {
                if i > 0 {
                    break;
                }
                if j == i {
                    break;
                }
                if first_key_inspected {
                    break;
                }
                first_key_inspected = true;
            }
        }
        // Phase 2: try candidate `i`, falling back along the `w.1` links.
        loop {
            let w = &amongs[i as usize];
            if common_i >= w.0.len() as i32 {
                self.cursor = c + w.0.len() as i32;
                if let Some(ref method) = w.3 {
                    // Run the entry's extra condition with the cursor placed
                    // after the match, and restore it afterwards.
                    let res = method(self, context);
                    self.cursor = c + w.0.len() as i32;
                    if res {
                        return w.2;
                    }
                } else {
                    return w.2;
                }
            }
            i = w.1;
            if i < 0 {
                return 0;
            }
        }
    }

    /// Backward counterpart of `find_among`: matches the table entries
    /// against the text ending at the cursor (entries are compared from
    /// their last byte backwards). On success the cursor is left before
    /// the matched text; returns `w.2`, or 0 when nothing matches.
    pub fn find_among_b<T>(&mut self, amongs: &[Among<T>], context: &mut T) -> i32 {
        let mut i: i32 = 0;
        let mut j: i32 = amongs.len() as i32;

        let c = self.cursor;
        let lb = self.limit_backward;

        let mut common_i = 0i32;
        let mut common_j = 0i32;

        let mut first_key_inspected = false;

        loop {
            let k = i + ((j - i) >> 1);
            let mut diff: i32 = 0;
            let mut common = if common_i < common_j {
                common_i
            } else {
                common_j
            };
            let w = &amongs[k as usize];
            // Compare from the end of the entry towards its start.
            for lvar in (0..w.0.len() - common as usize).rev() {
                if c - common == lb {
                    diff = -1;
                    break;
                }
                diff = self.current.as_bytes()[(c - common - 1) as usize] as i32 - w.0.as_bytes()[lvar] as i32;
                if diff != 0 {
                    break;
                }
                // Count up commons. But not one character but the byte width of that char
                common += 1;
            }
            if diff < 0 {
                j = k;
                common_j = common;
            } else {
                i = k;
                common_i = common;
            }
            if j - i <= 1 {
                if i > 0 {
                    break;
                }
                if j == i {
                    break;
                }
                if first_key_inspected {
                    break;
                }
                first_key_inspected = true;
            }
        }
        // Phase 2: try candidate `i`, falling back along the `w.1` links.
        loop {
            let w = &amongs[i as usize];
            if common_i >= w.0.len() as i32 {
                self.cursor = c - w.0.len() as i32;
                if let Some(ref method) = w.3 {
                    let res = method(self, context);
                    self.cursor = c - w.0.len() as i32;
                    if res {
                        return w.2;
                    }
                } else {
                    return w.2;
                }
            }
            i = w.1;
            if i < 0 {
                return 0;
            }
        }
    }
}
gitextract_z9nm0izk/
├── .gitignore
├── CONTRIBUTING.md
├── Cargo.toml
├── LICENSE
├── README.md
└── src/
├── index.html
├── index.js
├── lexer.rs
├── main.rs
├── model.rs
├── server.rs
└── snowball/
├── algorithms/
│ ├── english_stemmer.rs
│ └── mod.rs
├── among.rs
├── mod.rs
└── snowball_env.rs
SYMBOL INDEX (80 symbols across 8 files)
FILE: src/index.js
function search (line 2) | async function search(prompt) {
FILE: src/lexer.rs
type Lexer (line 1) | pub struct Lexer<'a> {
function new (line 6) | pub fn new(content: &'a [char]) -> Self {
function trim_left (line 10) | fn trim_left(&mut self) {
function chop (line 16) | fn chop(&mut self, n: usize) -> &'a [char] {
function chop_while (line 22) | fn chop_while<P>(&mut self, mut predicate: P) -> &'a [char] where P: FnM...
function next_token (line 30) | pub fn next_token(&mut self) -> Option<String> {
type Item (line 53) | type Item = String;
method next (line 55) | fn next(&mut self) -> Option<Self::Item> {
FILE: src/main.rs
function parse_entire_txt_file (line 19) | fn parse_entire_txt_file(file_path: &Path) -> Result<String, ()> {
function parse_entire_pdf_file (line 25) | fn parse_entire_pdf_file(file_path: &Path) -> Result<String, ()> {
function parse_entire_xml_file (line 55) | fn parse_entire_xml_file(file_path: &Path) -> Result<String, ()> {
function parse_entire_file_by_extension (line 76) | fn parse_entire_file_by_extension(file_path: &Path) -> Result<String, ()> {
function save_model_as_json (line 95) | fn save_model_as_json(model: &Model, index_path: &Path) -> Result<(), ()> {
function add_folder_to_model (line 111) | fn add_folder_to_model(dir_path: &Path, model: Arc<Mutex<Model>>, proces...
function usage (line 172) | fn usage(program: &str) {
function entry (line 178) | fn entry() -> Result<(), ()> {
function main (line 244) | fn main() -> ExitCode {
FILE: src/model.rs
type DocFreq (line 7) | type DocFreq = HashMap<String, usize>;
type TermFreq (line 8) | type TermFreq = HashMap<String, usize>;
type Doc (line 10) | pub struct Doc {
type Docs (line 16) | type Docs = HashMap<PathBuf, Doc>;
type Model (line 19) | pub struct Model {
method remove_document (line 25) | fn remove_document(&mut self, file_path: &Path) {
method requires_reindexing (line 35) | pub fn requires_reindexing(&mut self, file_path: &Path, last_modified:...
method search_query (line 42) | pub fn search_query(&self, query: &[char]) -> Vec<(PathBuf, f32)> {
method add_document (line 60) | pub fn add_document(&mut self, file_path: PathBuf, last_modified: Syst...
function compute_tf (line 87) | fn compute_tf(t: &str, doc: &Doc) -> f32 {
function compute_idf (line 93) | fn compute_idf(t: &str, n: usize, df: &DocFreq) -> f32 {
FILE: src/server.rs
function serve_404 (line 9) | fn serve_404(request: Request) -> io::Result<()> {
function serve_500 (line 13) | fn serve_500(request: Request) -> io::Result<()> {
function serve_400 (line 17) | fn serve_400(request: Request, message: &str) -> io::Result<()> {
function serve_bytes (line 21) | fn serve_bytes(request: Request, bytes: &[u8], content_type: &str) -> io...
function serve_api_search (line 29) | fn serve_api_search(model: Arc<Mutex<Model>>, mut request: Request) -> i...
function serve_api_stats (line 60) | fn serve_api_stats(model: Arc<Mutex<Model>>, request: Request) -> io::Re...
function serve_request (line 89) | fn serve_request(model: Arc<Mutex<Model>>, request: Request) -> io::Resu...
function start (line 111) | pub fn start(address: &str, model: Arc<Mutex<Model>>) -> Result<(), ()> {
FILE: src/snowball/algorithms/english_stemmer.rs
type Context (line 161) | struct Context {
function r_prelude (line 167) | fn r_prelude(env: &mut SnowballEnv, context: &mut Context) -> bool {
function r_mark_regions (line 236) | fn r_mark_regions(env: &mut SnowballEnv, context: &mut Context) -> bool {
function r_shortv (line 308) | fn r_shortv(env: &mut SnowballEnv, context: &mut Context) -> bool {
function r_R1 (line 338) | fn r_R1(env: &mut SnowballEnv, context: &mut Context) -> bool {
function r_R2 (line 345) | fn r_R2(env: &mut SnowballEnv, context: &mut Context) -> bool {
function r_Step_1a (line 352) | fn r_Step_1a(env: &mut SnowballEnv, context: &mut Context) -> bool {
function r_Step_1b (line 419) | fn r_Step_1b(env: &mut SnowballEnv, context: &mut Context) -> bool {
function r_Step_1c (line 491) | fn r_Step_1c(env: &mut SnowballEnv, context: &mut Context) -> bool {
function r_Step_2 (line 523) | fn r_Step_2(env: &mut SnowballEnv, context: &mut Context) -> bool {
function r_Step_3 (line 604) | fn r_Step_3(env: &mut SnowballEnv, context: &mut Context) -> bool {
function r_Step_4 (line 646) | fn r_Step_4(env: &mut SnowballEnv, context: &mut Context) -> bool {
function r_Step_5 (line 683) | fn r_Step_5(env: &mut SnowballEnv, context: &mut Context) -> bool {
function r_exception2 (line 731) | fn r_exception2(env: &mut SnowballEnv, context: &mut Context) -> bool {
function r_exception1 (line 743) | fn r_exception1(env: &mut SnowballEnv, context: &mut Context) -> bool {
function r_postlude (line 802) | fn r_postlude(env: &mut SnowballEnv, context: &mut Context) -> bool {
function stem (line 837) | pub fn stem(env: &mut SnowballEnv) -> bool {
FILE: src/snowball/among.rs
type Among (line 3) | pub struct Among<T: 'static>(pub &'static str,
FILE: src/snowball/snowball_env.rs
type SnowballEnv (line 5) | pub struct SnowballEnv<'a> {
function create (line 16) | pub fn create(value: &'a str) -> Self {
function get_current (line 28) | pub fn get_current(self) -> Cow<'a, str> {
function set_current (line 32) | pub fn set_current(&mut self, current: &'a str) {
function set_current_s (line 36) | pub fn set_current_s(&mut self, current: String) {
function replace_s (line 40) | fn replace_s(&mut self, bra: i32, ket: i32, s: &str) -> i32 {
function eq_s (line 65) | pub fn eq_s(&mut self, s: &str) -> bool {
function eq_s_b (line 82) | pub fn eq_s_b(&mut self, s: &str) -> bool {
function slice_from (line 96) | pub fn slice_from(&mut self, s: &str) -> bool {
function next_char (line 103) | pub fn next_char(&mut self) {
function previous_char (line 111) | pub fn previous_char(&mut self) {
function hop (line 118) | pub fn hop(&mut self, mut delta: i32) -> bool {
function hop_checked (line 134) | pub fn hop_checked(&mut self, delta: i32) -> bool {
function hop_back (line 138) | pub fn hop_back(&mut self, mut delta: i32) -> bool {
function hop_back_checked (line 154) | pub fn hop_back_checked(&mut self, delta: i32) -> bool {
function in_grouping (line 173) | pub fn in_grouping(&mut self, chars: &[u8], min: u32, max: u32) -> bool {
function in_grouping_b (line 192) | pub fn in_grouping_b(&mut self, chars: &[u8], min: u32, max: u32) -> bool {
function out_grouping (line 213) | pub fn out_grouping(&mut self, chars: &[u8], min: u32, max: u32) -> bool {
function out_grouping_b (line 232) | pub fn out_grouping_b(&mut self, chars: &[u8], min: u32, max: u32) -> bo...
function slice_del (line 256) | pub fn slice_del(&mut self) -> bool {
function insert (line 260) | pub fn insert(&mut self, bra: i32, ket: i32, s: &str) {
function assign_to (line 270) | pub fn assign_to(&mut self) -> String {
function slice_to (line 274) | pub fn slice_to(&mut self) -> String {
function find_among (line 278) | pub fn find_among<T>(&mut self, amongs: &[Among<T>], context: &mut T) ->...
function find_among_b (line 348) | pub fn find_among_b<T>(&mut self, amongs: &[Among<T>], context: &mut T) ...
Condensed preview — 16 files, each showing its path, character count, and a content snippet. Download the .json file or copy it to get the full structured content (62K chars).
[
{
"path": ".gitignore",
"chars": 8,
"preview": "/target\n"
},
{
"path": "CONTRIBUTING.md",
"chars": 651,
"preview": "I have very limited resources in terms of handling feedback on my projects, sorry. So here are the limitations to keep i"
},
{
"path": "Cargo.toml",
"chars": 313,
"preview": "[package]\nname = \"seroost\"\nversion = \"0.1.0\"\nedition = \"2021\"\n\n# See more keys and their definitions at https://doc.rust"
},
{
"path": "LICENSE",
"chars": 1074,
"preview": "Copyright 2023 Alexey Kutepov <reximkut@gmail.com>\n\nPermission is hereby granted, free of charge, to any person obtainin"
},
{
"path": "README.md",
"chars": 198,
"preview": "# Local Search Engine in Rust\n\n**THIS SOFTWARE IS UNFINISHED!!! Don't have any high expectations.**\n\n## Quick Start\n\n```"
},
{
"path": "src/index.html",
"chars": 244,
"preview": "<html>\n <head>\n <title>Seroost</title>\n </head>\n <body>\n <h1>Provide Your Query:</h1>\n <in"
},
{
"path": "src/index.js",
"chars": 839,
"preview": "// TODO: live update results as you type\nasync function search(prompt) {\n const results = document.getElementById(\"re"
},
{
"path": "src/lexer.rs",
"chars": 1631,
"preview": "pub struct Lexer<'a> {\n content: &'a [char],\n}\n\nimpl<'a> Lexer<'a> {\n pub fn new(content: &'a [char]) -> Self {\n "
},
{
"path": "src/main.rs",
"chars": 8835,
"preview": "use std::fs::{self, File};\nuse std::path::{Path};\nuse xml::reader::{XmlEvent, EventReader};\nuse xml::common::{Position, "
},
{
"path": "src/model.rs",
"chars": 2852,
"preview": "use std::collections::HashMap;\nuse std::path::{PathBuf, Path};\nuse serde::{Deserialize, Serialize};\nuse super::lexer::Le"
},
{
"path": "src/server.rs",
"chars": 4494,
"preview": "use std::str;\nuse std::io;\nuse std::sync::{Arc, Mutex};\n\nuse super::model::*;\n\nuse tiny_http::{Server, Request, Response"
},
{
"path": "src/snowball/algorithms/english_stemmer.rs",
"chars": 24192,
"preview": "//! Generated by Snowball 2.2.0 - https://snowballstem.org/\n\n#![allow(non_snake_case)]\n#![allow(non_upper_case_globals)]"
},
{
"path": "src/snowball/algorithms/mod.rs",
"chars": 110,
"preview": "// Have a look at build.rs\n//include!(concat!(env!(\"OUT_DIR\"), \"/lang_include.rs\"));\npub mod english_stemmer;\n"
},
{
"path": "src/snowball/among.rs",
"chars": 260,
"preview": "use crate::snowball::SnowballEnv;\n\npub struct Among<T: 'static>(pub &'static str,\n pub i32,\n"
},
{
"path": "src/snowball/mod.rs",
"chars": 219,
"preview": "// TODO: add Snowball license in here\npub mod algorithms;\nmod among;\nmod snowball_env;\n\n// TODO: why do we need this `cr"
},
{
"path": "src/snowball/snowball_env.rs",
"chars": 12897,
"preview": "use std::borrow::Cow;\nuse crate::snowball::Among;\n\n#[derive(Debug, Clone)]\npub struct SnowballEnv<'a> {\n pub current:"
}
]
About this extraction
This page contains the full source code of the tsoding/seroost GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 16 files (57.4 KB), approximately 15.5k tokens, and a symbol index with 80 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.