Repository: sudeep9/mojo Branch: main Commit: 730a25eeb9e8 Files: 53 Total size: 125.9 KB Directory structure: gitextract_2dkkx48z/ ├── .gitignore ├── Cargo.toml ├── LICENSE ├── README.md ├── build.sh ├── crates/ │ ├── mojo-cli/ │ │ ├── Cargo.toml │ │ └── src/ │ │ ├── buckets.rs │ │ ├── commit.rs │ │ ├── diff.rs │ │ ├── iget.rs │ │ ├── iview.rs │ │ ├── main.rs │ │ ├── state.rs │ │ └── truncate.rs │ ├── mojofs/ │ │ ├── .gitignore │ │ ├── Cargo.toml │ │ ├── meson.build │ │ ├── src/ │ │ │ ├── error.rs │ │ │ ├── kvfile.rs │ │ │ ├── lib.rs │ │ │ ├── native_file.rs │ │ │ ├── open_options.rs │ │ │ ├── vfs.rs │ │ │ └── vfsfile.rs │ │ └── tests/ │ │ └── mojofs_test.rs │ ├── mojoio/ │ │ ├── Cargo.toml │ │ └── src/ │ │ ├── error.rs │ │ ├── lib.rs │ │ └── nix.rs │ └── mojokv/ │ ├── .gitignore │ ├── Cargo.toml │ └── src/ │ ├── bmap.rs │ ├── bucket.rs │ ├── error.rs │ ├── index/ │ │ ├── mem.rs │ │ └── mod.rs │ ├── keymap.rs │ ├── lib.rs │ ├── state.rs │ ├── store.rs │ ├── utils.rs │ └── value.rs ├── docs/ │ ├── design.md │ ├── source.md │ └── user-guide.md ├── meson.build ├── sqlite-ext/ │ ├── mojo.c │ └── mojofs.h └── test-scripts/ ├── commands.py ├── perftest.py ├── test.sql ├── test2.sql └── testdb.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ target log log* *.log *.dylib .vscode *.db *.swp *-journal *-wal build Cargo.lock testdbs __pycache__ .DS_Store ================================================ FILE: Cargo.toml ================================================ [workspace] members = [ "crates/mojofs", "crates/mojokv", "crates/mojoio", "crates/mojo-cli", ] ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2022 Sudeep Jathar Permission is hereby granted, free of charge, to any person obtaining a copy of 
this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # MojoFS MojoFS is a versioning, userspace filesystem for sqlite DB. It is tailor made for sqlite and not supposed to be used as a general purpose fs. The main feature of the fs is versioning/snapshotting. Only one version is writable and all the old versions are immutable. - [MojoFS](#mojofs) - [License](#license) - [Development status](#development-status) - [Build](#build) - [Usage](#usage) - [Create database using mojo and insert few records](#create-database-using-mojo-and-insert-few-records) - [Select data](#select-data) - [Commit the database](#commit-the-database) - [Write to active version=2](#write-to-active-version2) - [Read old version=1](#read-old-version1) - [Read active version=2:](#read-active-version2) - [Docs](#docs) - [Source map](#source-map) - [Limits](#limits) - [Testing](#testing) - [Performance](#performance) - [Road to v1.0](#road-to-v10) # License I have changed it to MIT. 
See the [LICENSE](/LICENSE) file. # Development status |Item|Value| |-------|------| |Quality|pre-alpha| |Maintenance|active| # Build At present only mac/linux is supported. Windows is not supported only because I do not have windows machine. This may change in future. The build expects the following in the environment: 1. Meson Build + Ninja (see [here](https://mesonbuild.com/Getting-meson.html)) 2. C compiler (gcc or clang) 3. Rust version v1.59+ 4. sqlite headers + libraries 5. Optional: python3 for testing (most versions should be ok) ``` git clone github.com/sudeep9/mojo cd mojo ./build.sh release ``` Following artifacts will be in `build` dir: * `build/libmojo.dylib` (`.so` extension in linux) => The sqlite extension/vfs * `build/mojo-cli` => mojo cli tool to manage the file system # Usage All the examples below uses sqlite3 binary. However, you can use any bindings of sqlite. ## Create database using mojo and insert few records ``` rm -fR a.db sqlite3 < [here](https://github.com/sudeep9/mojo/blob/main/docs/design.md) * user-guide => [here](https://github.com/sudeep9/mojo/blob/main/docs/user-guide.md) ### Source map * All the rust code is under `crates` folder * All the docs are under `docs` folder * The `sqlite-ext` folder has C code which is compiled down to shared lib * The `test-scripts` has various assorted test scripts which includes perf & black-box test Crate wise details are at: [here](https://github.com/sudeep9/mojo/blob/main/docs/source.md) # Limits For a page size = 4KB (which is the default for sqlite for some years now) following are the limits: * Max sqlite db file size (KB) = `pow(2,32) * 4` = 17179869184 KB or 16TB 16TB is logical size i.e. file size reported by stat like call in any version. Since there could be multiple versions of the file, the total size of all such versions taken together can exceed 16TB. 
* Max version num = `pow(2,24)` = 16777216 or 16 million To put 16M versions in perspective, even if you create a version every 1 min, it will take ~31.92 years to reach the max version. Note: All these limits are actually artificial to keep the memory usage reasonable. In future, these will be tunable and also have the ability to baseline the versions. # Testing I wanted to use sqlite [test harness](https://www.sqlite.org/th3.html) but it requires license. Quoting from the test harness link: > SQLite itself is in the public domain and can be used for any purpose. But TH3 is proprietary and requires a license. Instead there is `testdb.py` for black box testing and `perftest.py` for perf tests. At present the `testdb.py` tests combinations of the following: ``` page_sizes = [4096] journal_modes = ["OFF", "WAL", "MEMORY", "DELETE", "TRUNCATE", "PERSIST"] vacuum_modes = ["NONE", "FULL", "INCREMENTAL"] ``` For each of the combination, there are about ~11 subtests so in all 18 x 11 = 198 tests. These are early days and off-course there is a long way to go. To run the full suite: ``` python3 testdb.py build/libmojo full ``` # Performance About `10_000_000` rows are inserted and then for reading we select the rows and get the row count. Finally it updates half the rows. 
To run the perf test: ``` MOJOKV_CLI=build/mojo-cli python3 perftest.py ./build/libmojo ``` Output on 2018/19 macbook: ``` Running perf for: insert vfs=std time elapsed (s): 20.524982929229736 vfs=mojo time elapsed (s): 21.202611923217773 Mojo takes 1.033 times than std vfs ------------------------ Running perf for: update rows vfs=std time elapsed (s): 2.871242046356201 vfs=mojo time elapsed (s): 2.439574956893921 Mojo takes 0.85 times than std vfs ------------------------ Running perf for: select select iter count: 10000000 vfs=std time elapsed (s): 8.659775018692017 select iter count: 10000000 vfs=mojo time elapsed (s): 5.907814025878906 Mojo takes 0.682 times than std vfs ------------------------ Running perf for: row count row count: 0 vfs=std time elapsed (s): 2.96425199508667 row count: 0 vfs=mojo time elapsed (s): 1.5106308460235596 Mojo takes 0.51 times than std vfs ------------------------ ``` The writes being only `1.033` times worse is in line with my expectations. However, I am investigating why the reads are so better with mojo. My guess as of now is that in the standard default vfs at https://github.com/sqlite/sqlite/blob/master/src/os_unix.c does not use pread, whereas mojo uses pread. Lack of pread results into two system call i.e. seek + read. This ***might*** explain the perf difference. This need further confirmation though. See the comment and code in the c file above: ``` ** ... Since SQLite does not define USE_PREAD ** in any form by default, we will not attempt to define _XOPEN_SOURCE. ** See tickets #2741 and #2681. 
``` In seekAndRead function: ``` #if defined(USE_PREAD) got = osPread(id->h, pBuf, cnt, offset); SimulateIOError( got = -1 ); #elif defined(USE_PREAD64) got = osPread64(id->h, pBuf, cnt, offset); SimulateIOError( got = -1 ); #else newOffset = lseek(id->h, offset, SEEK_SET); SimulateIOError( newOffset = -1 ); if( newOffset<0 ){ storeLastErrno((unixFile*)id, errno); return -1; } got = osRead(id->h, pBuf, cnt); ``` # Road to v1.0 It needs atleast the following: - [ ] Top-notch unit & black box test coverage - [ ] Ease of use e.g debugability, add more mojo-cli admin commands - [ ] Ability to diff the versions - [ ] Ability to delete versions - [ ] Ability to merge versions (not like git merge) - [ ] Ability to recover from corrupted fs. - [ ] Stabilize on-disk format - [ ] User guide A lot of the above needs to be clearly defined. ================================================ FILE: build.sh ================================================ #!/usr/bin/env bash export BUILD_PROFILE=debug export CARGO_OPT="" export MESON_BUILD_PROFILE="debug" export MESON_BUILD_DIR="build" if [ $# -gt 0 ]; then if [ $1 = "release" ]; then export BUILD_PROFILE=release export CARGO_OPT="--release" export MESON_BUILD_PROFILE="release" fi fi echo "BUILD_PROFILE=$BUILD_PROFILE" echo "-------------------------------" echo " [Rust build starting]" echo "-------------------------------" cargo build $CARGO_OPT if [ $? -ne 0 ]; then echo "Error: Cargo build failed with $?" exit $? fi mkdir -p $MESON_BUILD_DIR rm -fR $MESON_BUILD_DIR/* cp target/$BUILD_PROFILE/mojo-cli $MESON_BUILD_DIR/. if [ $? -ne 0 ]; then echo "Copying failed" exit $? fi echo "-------------------------------" echo " [C build starting]" echo "-------------------------------" meson setup --buildtype=$MESON_BUILD_PROFILE $MESON_BUILD_DIR if [ $? -ne 0 ]; then echo "Error: meson reconfigure failed with $?" exit $? fi meson compile -C $MESON_BUILD_DIR if [ $? -ne 0 ]; then echo "Error: meson compile failed with $?" exit $? 
fi echo ">> The build artifacts are at the dir: $MESON_BUILD_DIR" if [ "$MOJO_TEST" != "" ]; then echo "-------------------------------" echo " [Mojo test starting]" echo "-------------------------------" export MOJOKV_CLI=./target/$BUILD_PROFILE/mojo-cli python3 test-scripts/testdb.py $MESON_BUILD_DIR/mojo if [ $? -ne 0 ]; then echo "Error: mojo test failed with $?" exit 1 fi fi ================================================ FILE: crates/mojo-cli/Cargo.toml ================================================ [package] name = "mojo-cli" version = "0.1.0" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] mojokv = {path="../mojokv"} log = "0.4.17" env_logger = "0.9.0" clap = {version="3.2.6", features=["derive"] } anyhow = "1.0.58" ================================================ FILE: crates/mojo-cli/src/buckets.rs ================================================ use anyhow::Error; use mojokv::BucketMap; pub fn cmd(kvpath: &std::path::Path, ver: u32) -> Result<(), Error> { let bmap = BucketMap::load(kvpath, ver)?; for (bucket_name, ver) in bmap.map()?.iter() { println!("{} -> {}", bucket_name, ver); } Ok(()) } ================================================ FILE: crates/mojo-cli/src/commit.rs ================================================ use anyhow::Error; use mojokv::Store; pub fn cmd(kvpath: &std::path::Path) -> Result<(), Error> { let st = Store::writable(&kvpath, false, None, None)?; println!("active version before commit: {}", st.active_ver()); let new_ver = st.commit()?; println!("active version after commit: {}", new_ver); Ok(()) } ================================================ FILE: crates/mojo-cli/src/diff.rs ================================================ use anyhow::Error; use mojokv::KVStore; pub fn cmd(kvpath: &std::path::Path, fver: u32, tver: u32) -> Result<(), Error> { if fver >= tver { return Err(Error::msg("'from' version cannot be greater than or equal 'to' 
version")); } let state = KVStore::load_state(kvpath)?; let f_index = KVStore::load_index(kvpath, fver)?; let t_index = KVStore::load_index(kvpath, tver)?; let mut key = 0u32; for (slot_index, t_slot) in t_index.kmap.slot_map.iter().enumerate() { let f_slot = &f_index.kmap.slot_map[slot_index]; match (f_slot, t_slot) { (Some(fs), Some(ts)) => { if fs.len() != ts.len() { return Err(Error::msg(format!("Slot length mismatch {} {}", fs.len(), ts.len()))); } for (j,(fv, tv)) in std::iter::zip(fs, ts).enumerate() { if fv.get_ver() != tv.get_ver() { println!("M k={} fv={} tv={} fo={} to={}", key+j as u32, fv.get_ver(), tv.get_ver(), fv.get_off(), tv.get_off()); } } }, (Some(_fs), None) => { println!("D {} -> {} deleted", key, key + state.pps); }, (None, Some(_ts)) => { println!("A {} -> {} added", key, key + state.pps); }, (None, None) => {}, } key += state.pps; } Ok(()) } ================================================ FILE: crates/mojo-cli/src/iget.rs ================================================ use anyhow::Error; use mojokv::{Store,BucketOpenMode}; pub fn cmd(kvpath: &std::path::Path, bucket: &str, ver: u32, key: u32) -> Result<(), Error> { let st = Store::readonly(&kvpath, ver)?; let b = st.open(bucket, BucketOpenMode::Read)?; println!("Max key: {}", b.max_key()); match b.get_key(key)? { Some(val) => { println!("value: {:?}", val); }, None => { println!("Key not found") } } Ok(()) } ================================================ FILE: crates/mojo-cli/src/iview.rs ================================================ use anyhow::Error; use mojokv::{Store}; pub fn cmd(kvpath: &std::path::Path, name: &str, ver: u32, additional: bool, keys: bool) -> Result<(), Error> { let st = Store::readonly(kvpath, ver)?; let ret = st.get_index(name)?; //Bucket::load_index(&kvpath, name, ver)?; if ret.is_none() { println!("Bucket {} does not exists", name); } let (uncomp_sz, comp_sz, i) = ret.unwrap(); let h = i.header(); let st = if additional { Some(Store::load_state(kvpath)?) 
}else{ None }; println!("Format version : {}", h.format_ver); println!("Minimum version : {}", h.min_ver); println!("Maximum version : {}", h.max_ver); println!("Active version : {}", h.active_ver); println!("Pages per slot : {}", h.pps); println!("Maximum key : {}", h.max_key); println!("Compressed size : {}", comp_sz); println!("Uncompressed size : {}", uncomp_sz); if let Some(st) = st { println!("----------------------"); println!("Logical size : {}", st.page_size() * (h.max_key + 1) as u32); } if keys { println!("----------------------"); println!("keys"); for (key, val) in i.iter(0, 0) { println!(" {} {:?}", key, val); } } Ok(()) } ================================================ FILE: crates/mojo-cli/src/main.rs ================================================ mod iget; mod iview; mod state; mod commit; mod buckets; use anyhow::Error; use clap::{Parser, Subcommand}; #[derive(Parser)] #[clap(author, version, about, long_about = None)] #[clap(propagate_version = true)] struct Cli { kvpath: std::path::PathBuf, #[clap(subcommand)] command: Commands, } #[derive(Subcommand)] enum Commands { /// View index value for a key #[clap(name="iget")] MemIndexGet{ #[clap(value_parser)] bucket: String, #[clap(value_parser)] ver: u32, #[clap(value_parser)] key: u32, }, /// View index value for a key #[clap(name="iview")] MemIndexView{ #[clap(value_parser)] bucket: String, #[clap(value_parser)] ver: u32, #[clap(short, action)] additional: bool, #[clap(short, action)] keys: bool }, /// View the current kv state #[clap(name="state")] State{ /// Print additional internal numbers #[clap(short, action)] additional: bool }, /// Commit the store #[clap(name="commit")] Commit{ }, /// List buckets #[clap(name="buckets")] Buckets{ /// Version of the bmap #[clap(value_parser)] ver: u32, }, } fn main() -> Result<(), Error> { env_logger::init(); let cli = Cli::parse(); // You can check for the existence of subcommands, and if found use their // matches just as you would the top level cmd 
match &cli.command { Commands::MemIndexGet{bucket, ver, key} => { iget::cmd(&cli.kvpath, bucket.as_str(), *ver, *key)?; }, Commands::MemIndexView{bucket, ver, additional, keys} => { iview::cmd(&cli.kvpath, bucket.as_str(), *ver, *additional, *keys)?; }, Commands::State{additional} => { state::cmd(&cli.kvpath, *additional)?; }, Commands::Commit{} => { commit::cmd(&cli.kvpath)?; }, Commands::Buckets{ver} => { buckets::cmd(&cli.kvpath, *ver)?; }, } Ok(()) } ================================================ FILE: crates/mojo-cli/src/state.rs ================================================ use anyhow::Error; use mojokv::{self, Store}; use std::mem::size_of; pub fn cmd(kvpath: &std::path::Path, additional: bool) -> Result<(), Error> { let st = Store::load_state(kvpath)?; println!("Format version : {}", st.format_ver()); println!("Minimum version : {}", st.min_ver()); println!("Active version : {}", st.active_ver()); println!("Pages per slot : {}", st.pps()); println!("Page size : {}", st.page_size()); println!("File header len : {}", st.file_page_sz()); if additional { println!("----------------------------"); println!("Size of KVStore : {} bytes", size_of::()); println!("Size of MemIndex : {} bytes", size_of::()); println!("Size of KeyMap : {} bytes", size_of::()); println!("Size of Value : {} bytes", size_of::()); println!("Size of Slot : {} bytes", size_of::()); } Ok(()) } ================================================ FILE: crates/mojo-cli/src/truncate.rs ================================================ use anyhow::Error; use mojokv::KVStore; pub fn cmd(kvpath: &std::path::Path, sz: usize) -> Result<(), Error> { let st = KVStore::load_state(&kvpath)?; if sz % (st.page_size() as usize) != 0 { return Err(Error::msg(format!("Error: truncate size is not multiple of page sz ({})", st.page_size()))); } let mut store = KVStore::writable(kvpath, st.page_size(), Some(st.pps()))?; store.truncate(sz)?; store.sync()?; Ok(()) } ================================================ 
FILE: crates/mojofs/.gitignore ================================================ testfs ================================================ FILE: crates/mojofs/Cargo.toml ================================================ [package] name = "mojofs" version = "0.1.0" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [lib] crate-type = ["staticlib", "lib"] [dev-dependencies] anyhow = "1.0" env_logger = "0.9.0" [dependencies] libsqlite3-sys = {version = "0.24.2", features = ["bundled"]} thiserror = "1.0.31" parking_lot = "0.12.1" mojokv = {path = "../mojokv"} nix = "0.24" log = "0.4.17" env_logger = "0.9.0" ================================================ FILE: crates/mojofs/meson.build ================================================ project('cmojo', 'c') if get_option('buildtype') == 'release' mojo_lib_dir = meson.source_root() + '/../target/release' else mojo_lib_dir = meson.source_root() + '/../target/debug' endif message(mojo_lib_dir) mojo_lib = meson.get_compiler('c').find_library('mojofs', static: true, dirs: [mojo_lib_dir]) shared_library('cmojo', sources:['cmojo.c'], dependencies: [mojo_lib]) ================================================ FILE: crates/mojofs/src/error.rs ================================================ pub const MOJOFS_ERR_NOT_IMPL: i32 = 1; pub const MOJOFS_ERR_IO: i32 = 2; pub const MOJOFS_ERR_NIX: i32 = 3; pub const MOJOFS_ERR_UTF8: i32 = 4; pub const MOJOFS_ERR_MOJOKV: i32 = 5; pub const MOJOFS_ERR_URL_PARSE: i32 = 6; pub const MOJOFS_ERR_INT_PARSE: i32 = 7; pub const MOJOFS_ERR_LARGE_PAGE: i32 = 8; pub const MOJOFS_ERR_ARG_VER_MISSING: i32 = 9; pub const MOJOFS_ERR_ARG_PAGESZ_MISSING: i32 = 10; pub const MOJOFS_ERR_ARG_PPS_MISSING: i32 = 11; #[derive(thiserror::Error, Debug)] pub struct Error { pub code: i32, pub msg: String, } impl Error { pub fn new(code: i32, msg: String) -> Self { Error { code, msg, } } pub fn not_impl() -> Self { Error::new(MOJOFS_ERR_NOT_IMPL, "Not 
implemented".to_owned()) } } impl From for Error { fn from(err: std::io::Error) -> Self { Error { code: MOJOFS_ERR_IO, msg: format!("{:?}", err), } } } impl From for Error { fn from(err: nix::Error) -> Self { Error { code: MOJOFS_ERR_NIX, msg: err.to_string(), } } } impl From for Error { fn from(err: std::str::Utf8Error) -> Self { Error { code: MOJOFS_ERR_UTF8, msg: err.to_string(), } } } impl From for Error { fn from(err: std::num::ParseIntError) -> Self { Error { code: MOJOFS_ERR_INT_PARSE, msg: err.to_string(), } } } impl From for Error { fn from(err: mojokv::Error) -> Self { Error { code: MOJOFS_ERR_MOJOKV, msg: err.to_string(), } } } impl std::fmt::Display for Error { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "({}, {})", self.code, self.msg) } } ================================================ FILE: crates/mojofs/src/kvfile.rs ================================================ use mojokv::Bucket; use crate::Error; pub struct KVFile { pub bucket: Bucket, opt: KVFileOpt, } #[derive(Clone)] pub struct KVFileOpt { pub page_sz: u32, pub pps: u32, pub ver: u32, } impl KVFile { pub fn open(bucket: Bucket, opt: KVFileOpt) -> Result { Ok(KVFile{ bucket, opt, }) } pub fn pread(&self, buf: &mut [u8], off: i64) -> Result { log::debug!("kv pread o={}, blen={}", off, buf.len()); if buf.len() > self.opt.page_sz as usize { return Err(Error::new(crate::MOJOFS_ERR_LARGE_PAGE, format!("buf larger ({}) than page size", buf.len()))); } let page_off = off % self.opt.page_sz as i64; let key = off / self.opt.page_sz as i64; let n = match self.bucket.get(key as u32, page_off as u64, buf) { Ok(n) => n, Err(err) => { if let mojokv::Error::KeyNotFoundErr(_) = err { 0 }else{ return Err(err.into()); } } }; if n Result<(), Error> { log::debug!("kv pwrite page key={}, po={} blen={}", key, page_off, buf.len()); self.bucket.put(key, page_off as u64, buf)?; Ok(()) } pub fn pwrite(&mut self, off: i64, buf: &[u8]) -> Result<(), Error> { log::debug!("kv pwrite 
o={}, blen={}", off, buf.len()); let mut po = off % self.opt.page_sz as i64; let mut key = off / self.opt.page_sz as i64; let mut s = 0usize; let buflen = buf.len(); while s < buflen { let e = (buflen-s).min(self.opt.page_sz as usize - po as usize); self.pwrite_page(key as u32, po as u32, &buf[s..s+e])?; s += e; po = 0; key += self.opt.page_sz as i64; } Ok(()) } pub fn close(self) -> Result<(), Error> { self.bucket.close()?; Ok(()) } pub fn sync(&mut self) -> Result<(), Error> { self.bucket.sync()?; Ok(()) } pub fn filesize(&self) -> Result { Ok(self.bucket.logical_size()) } pub fn truncate(&mut self, new_sz: u64) -> Result<(), Error> { log::debug!("kv truncate {}", new_sz); self.bucket.truncate(new_sz as usize)?; Ok(()) } } ================================================ FILE: crates/mojofs/src/lib.rs ================================================ pub mod vfs; pub mod error; pub mod open_options; pub mod vfsfile; mod native_file; mod kvfile; use std::ffi::CStr; use std::collections::HashMap; use libsqlite3_sys::{ self, sqlite3_file, sqlite3_vfs}; use std::ffi::c_void; use std::os::raw::{c_int, c_char}; pub use error::*; pub use vfs::{VFS,AccessCheck}; pub use vfsfile::VFSFile; pub use open_options::*; #[repr(C)] pub struct MojoFile { base: sqlite3_file, custom_file: *mut c_void, vfs: *mut sqlite3_vfs, } #[no_mangle] pub extern "C" fn mojo_create() -> *mut sqlite3_vfs { let mut name_buf = Vec::from("mojo".as_bytes()); name_buf.push(0); let fs_name_c = name_buf.as_ptr(); std::mem::forget(name_buf); let fs = Box::new(vfs::VFS::default()); let p_app_data = Box::into_raw(fs) as *mut c_void; //println!("p_app_data={:?}", p_app_data); //let sql_vfs = unsafe { // let raw_ptr = sqlite3_malloc64(std::mem::size_of::() as u64); // raw_ptr as *mut sqlite3_vfs //}; let vfs = Box::into_raw(Box::new(sqlite3_vfs{ iVersion: 1, szOsFile: (std::mem::size_of::()) as i32, mxPathname: 512, pNext: std::ptr::null_mut(), zName: fs_name_c as *const i8, pAppData: p_app_data, xOpen: 
Some(mojo_open), xDelete: Some(mojo_delete), xAccess: Some(mojo_access), xFullPathname: Some(mojo_fullname), xDlOpen: Some(mojo_dlopen), xDlError: Some(mojo_dlerror), xDlSym: Some(mojo_dlsym), xDlClose: Some(mojo_dlclose), xRandomness: Some(mojo_randomness), xSleep: Some(mojo_sleep), xCurrentTime: Some(mojo_current_time), xCurrentTimeInt64: Some(mojo_current_time64), xGetLastError: Some(mojo_getlasterr), xSetSystemCall: None, xGetSystemCall: None, xNextSystemCall: None, })); vfs } #[no_mangle] extern "C" fn mojo_open(vfs: *mut sqlite3_vfs, zname: *const c_char, file: *mut sqlite3_file, flags: c_int, out_flags: *mut c_int) -> c_int { let fs = getfs(vfs); let opt = open_options::OpenOptions::from_flags(flags as i32).unwrap(); let mut out_opt = opt.clone(); let file_str = if zname.is_null() { "" }else{ let file_rs = unsafe{std::ffi::CStr::from_ptr(zname)}; match file_rs.to_str() { Ok(file_str) => file_str, Err(_err) => { println!("mojo_open error in filepath"); return libsqlite3_sys::SQLITE_CANTOPEN }, } }; if opt.kind.is_main() { let query_map = match extract_query_params(zname) { Ok(map) => map, Err(_err) => { return libsqlite3_sys::SQLITE_CANTOPEN } }; if let Err(err) = fs.init(file_str, &query_map, opt.clone()) { log::error!("mojo_open init path={} err = {:?}", file_str, err); return libsqlite3_sys::SQLITE_CANTOPEN } } match fs.open(file_str, opt, &mut out_opt) { Ok(vfs_file) => { let mojo_file = unsafe {(file as *mut MojoFile).as_mut().unwrap()}; let io_methods = Box::into_raw(Box::new(libsqlite3_sys::sqlite3_io_methods{ iVersion: 1, xClose: Some(mojo_close), xRead: Some(mojo_read), xWrite: Some(mojo_write), xTruncate: Some(mojo_truncate), xSync: Some(mojo_sync), xFileSize: Some(mojo_filesize), xLock: Some(mojo_lock), xUnlock: Some(mojo_unlock), xCheckReservedLock: Some(mojo_check_reserved_lock), xFileControl: Some(mojo_file_control), xSectorSize: Some(mojo_sector_size), xDeviceCharacteristics: Some(mojo_device_char), xShmMap: None, xShmLock: None, xShmBarrier: 
None, xShmUnmap: None, xFetch: None, xUnfetch: None, })); mojo_file.base.pMethods = io_methods as *const libsqlite3_sys::sqlite3_io_methods; mojo_file.custom_file = Box::into_raw(vfs_file) as *mut c_void; mojo_file.vfs = vfs; }, Err(err) => { log::error!("mojo_open path={} err = {:?}", file_str, err); return libsqlite3_sys::SQLITE_CANTOPEN; } } unsafe { if !out_flags.is_null() { *out_flags = out_opt.flags; } }; libsqlite3_sys::SQLITE_OK } #[no_mangle] extern "C" fn mojo_close(sfile: *mut sqlite3_file) -> c_int { unsafe { let mojo_file = (sfile as *mut MojoFile).as_ref().unwrap(); let vfs_file = Box::from_raw(mojo_file.custom_file as *mut VFSFile); let fs = getfs(mojo_file.vfs); match fs.close(*vfs_file) { Ok(_) => {}, Err(_err) => { return libsqlite3_sys::SQLITE_IOERR_CLOSE } } } libsqlite3_sys::SQLITE_OK } #[no_mangle] extern "C" fn mojo_read(sfile: *mut sqlite3_file, ptr: *mut c_void, n: i32, off: i64) -> c_int { let file = get_file(sfile); let buf = unsafe{ std::slice::from_raw_parts_mut(ptr as *mut u8, n as usize)}; let rc = match file.pread(off as u64, buf) { Ok(n) => { //let m = n.min(20); //log::debug!("after read n={} {:?}", n, &buf[..m]); if n == buf.len() { libsqlite3_sys::SQLITE_OK }else{ //println!("short read"); libsqlite3_sys::SQLITE_IOERR_SHORT_READ } } Err(err) => { log::error!("mojo_read id={} off={} blen={} err={:?}", file.id(), off, buf.len(), err); libsqlite3_sys::SQLITE_IOERR_READ }, }; rc } #[no_mangle] extern "C" fn mojo_write(sfile: *mut sqlite3_file, ptr: *const c_void, n: i32, off: i64) -> c_int { let file = get_file_mut(sfile); let buf = unsafe{ std::slice::from_raw_parts(ptr as *const u8, n as usize)}; let rc = match file.pwrite(off as u64, buf) { Ok(_) => libsqlite3_sys::SQLITE_OK, Err(err) => { log::error!("mojo_write id={} off={} blen={} err={:?}", file.id(), off, buf.len(), err); libsqlite3_sys::SQLITE_IOERR_WRITE }, }; rc } #[no_mangle] extern "C" fn mojo_truncate(sfile: *mut sqlite3_file, new_sz: i64) -> c_int { let file = 
get_file_mut(sfile);
    // Continuation of mojo_truncate (header is above this chunk): forward the
    // truncate to the VFSFile behind the sqlite3_file handle.
    let rc = match file.truncate(new_sz as u64) {
        Ok(_) => libsqlite3_sys::SQLITE_OK,
        Err(err) => {
            log::error!("mojo_truncate id={} new_sz={} err={:?}", file.id(), new_sz, err);
            // NOTE(review): SQLITE_IOERR_TRUNCATE would be the more precise code.
            libsqlite3_sys::SQLITE_IOERR_WRITE
        },
    };
    rc
}

/// xSync: flush the file. `flags` is SQLITE_SYNC_NORMAL/FULL (forwarded as-is).
#[no_mangle]
extern "C" fn mojo_sync(sfile: *mut sqlite3_file, flags: i32) -> c_int {
    let file = get_file_mut(sfile);
    let rc = match file.sync(flags) {
        Ok(_) => libsqlite3_sys::SQLITE_OK,
        Err(err) => {
            log::error!("mojo_sync id={} err={:?}", file.id(), err);
            libsqlite3_sys::SQLITE_IOERR_WRITE
        },
    };
    rc
}

/// xFileSize: report the logical size of the file in bytes via `out_sz`.
#[no_mangle]
extern "C" fn mojo_filesize(sfile: *mut sqlite3_file, out_sz: *mut i64) -> c_int {
    let file = get_file(sfile);
    let rc = match file.filesize() {
        Ok(sz) => {
            unsafe { *out_sz = sz as i64; }
            libsqlite3_sys::SQLITE_OK
        }
        // NOTE(review): SQLITE_IOERR_FSTAT would be the more precise code.
        Err(_) => libsqlite3_sys::SQLITE_IOERR_WRITE,
    };
    rc
}

/// xAccess: existence / read-write probe. Writes 1 or 0 into `resout`.
#[no_mangle]
extern "C" fn mojo_access(vfs: *mut sqlite3_vfs, zname: *const c_char, flags: c_int, resout: *mut c_int) -> c_int {
    let path = match c_to_path(zname) {
        Ok(path) => path,
        Err(_) => {
            return libsqlite3_sys::SQLITE_IOERR_CONVPATH;
        }
    };

    let access_req = if flags == libsqlite3_sys::SQLITE_ACCESS_EXISTS {
        AccessCheck::Exists
    } else {
        AccessCheck::ReadWrite
    };

    let fs = getfs(vfs);
    match fs.access(&path, access_req) {
        Ok(status) => {
            unsafe { *resout = if status { 1 } else { 0 } }
        },
        Err(_) => {
            return libsqlite3_sys::SQLITE_IOERR_ACCESS;
        }
    }

    libsqlite3_sys::SQLITE_OK
}

/// xDelete: remove the bucket backing `zname`. `_syncdir` is ignored (TODO in VFS::delete).
#[no_mangle]
extern "C" fn mojo_delete(vfs: *mut sqlite3_vfs, zname: *const c_char, _syncdir: c_int) -> c_int {
    let path = match c_to_path(zname) {
        Ok(path) => path,
        Err(_) => {
            return libsqlite3_sys::SQLITE_IOERR_DELETE;
        }
    };

    let fs = getfs(vfs);
    match fs.delete(&path) {
        Ok(_) => {
            libsqlite3_sys::SQLITE_OK
        }
        Err(err) => {
            log::error!("mojo_delete path={:?} err={:?}", path, err);
            libsqlite3_sys::SQLITE_IOERR_DELETE
        }
    }
}

/// xFullPathname.
/// NOTE(review): the resolved path is computed but never copied into `_zout`;
/// SQLite expects the canonical path to be written there (bounded by `_nout`).
/// Behaviour preserved here — flagged for a follow-up fix.
#[no_mangle]
extern "C" fn mojo_fullname(vfs: *mut sqlite3_vfs, zname: *const c_char, _nout: c_int, _zout: *mut c_char) -> c_int {
    let fs = getfs(vfs);
    let file_rs = unsafe { std::ffi::CStr::from_ptr(zname) };
    let file_str = match file_rs.to_str() {
        Ok(file_str) => file_str,
        Err(_err) => {
            log::error!("mojo_fullname error in filepath");
            return libsqlite3_sys::SQLITE_CANTOPEN;
        }
    };
    let _ = fs.fullpath(file_str);
    libsqlite3_sys::SQLITE_OK
}

// Dynamic-library loading is not supported by this VFS: all xDl* entry points
// are inert stubs.
#[no_mangle]
extern "C" fn mojo_dlopen(_arg1: *mut sqlite3_vfs, _zfilename: *const c_char) -> *mut c_void {
    std::ptr::null_mut()
}

#[no_mangle]
extern "C" fn mojo_dlerror(_arg1: *mut sqlite3_vfs, _nbyte: c_int, _zerrmsg: *mut c_char) {
}

// NOTE(review): return type reconstructed — extraction stripped the generics;
// libsqlite3_sys declares xDlSym as returning Option<unsafe extern "C" fn()>.
#[no_mangle]
extern "C" fn mojo_dlsym(_arg1: *mut sqlite3_vfs, _arg2: *mut c_void, _zsymbol: *const c_char) -> Option<unsafe extern "C" fn()> {
    None
}

#[no_mangle]
extern "C" fn mojo_dlclose(_arg1: *mut sqlite3_vfs, _arg2: *mut c_void) {
}

/// xRandomness: stub — leaves the buffer untouched.
/// NOTE(review): SQLite uses this to seed PRNG state; filling `_zout` with
/// real entropy would be safer.
#[no_mangle]
extern "C" fn mojo_randomness(_arg1: *mut sqlite3_vfs, _nbyte: c_int, _zout: *mut c_char) -> c_int {
    libsqlite3_sys::SQLITE_OK
}

/// xSleep: block the calling thread for `microseconds`.
#[no_mangle]
extern "C" fn mojo_sleep(_arg1: *mut sqlite3_vfs, microseconds: c_int) -> c_int {
    std::thread::sleep(std::time::Duration::from_micros(microseconds as u64));
    libsqlite3_sys::SQLITE_OK
}

/// xCurrentTime: current time as a fractional Julian Day Number (unit = days).
/// BUG FIX: the old code multiplied the result by 86_400_000, producing a
/// millisecond-scale number where SQLite expects days. The standard formula
/// (see SQLite demovfs/os_unix.c) is: unix_seconds/86400.0 + 2440587.5.
#[no_mangle]
extern "C" fn mojo_current_time(_arg1: *mut sqlite3_vfs, p: *mut f64) -> c_int {
    let now = (std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap())
        .as_secs() as f64;
    unsafe {
        *p = 2440587.5 + now / 86400.0;
    }
    libsqlite3_sys::SQLITE_OK
}

/// xCurrentTimeInt64: Julian Day expressed in milliseconds.
/// BUG FIX: the formula `(2440587.5 + x/86400000)*86400000` is only correct
/// when `x` is in *milliseconds*; the old code passed seconds, making the
/// time-of-day term 1000x too small. Equivalent closed form used by SQLite:
/// 2440587.5 * 86_400_000 + unix_millis.
#[no_mangle]
extern "C" fn mojo_current_time64(_arg1: *mut sqlite3_vfs, p: *mut i64) -> c_int {
    let now_ms = (std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap())
        .as_millis() as i64;
    unsafe {
        *p = (2440587.5 * 86_400_000.0) as i64 + now_ms;
    }
    libsqlite3_sys::SQLITE_OK
}

/// xGetLastError: stub — no extended error reporting.
#[no_mangle]
extern "C" fn mojo_getlasterr(_arg1: *mut sqlite3_vfs, _arg2: c_int, _arg3: *mut c_char) -> c_int {
    libsqlite3_sys::SQLITE_OK
}

// Locking is a no-op for this VFS (single-process usage is assumed).
#[no_mangle]
extern "C" fn mojo_lock(_sfile: *mut sqlite3_file, _flags: c_int) -> c_int {
    libsqlite3_sys::SQLITE_OK
}

#[no_mangle]
extern "C" fn mojo_unlock(_sfile: *mut sqlite3_file, _flags: c_int) -> c_int {
    libsqlite3_sys::SQLITE_OK
}

/// xCheckReservedLock: always reports "not reserved".
#[no_mangle]
extern "C" fn mojo_check_reserved_lock(_sfile: *mut sqlite3_file, res_out: *mut c_int) -> c_int {
    unsafe { *res_out = 0; }
    libsqlite3_sys::SQLITE_OK
}

/// xFileControl: no opcodes handled; SQLITE_NOTFOUND tells SQLite to use defaults.
#[no_mangle]
extern "C" fn mojo_file_control(_sfile: *mut sqlite3_file, _op: c_int, _arg: *mut c_void) -> c_int {
    libsqlite3_sys::SQLITE_NOTFOUND
}

/// xSectorSize: 0 lets SQLite assume its default sector size.
#[no_mangle]
extern "C" fn mojo_sector_size(_sfile: *mut sqlite3_file) -> c_int {
    0
}

/// xDeviceCharacteristics: no special IOCAP flags advertised.
#[no_mangle]
extern "C" fn mojo_device_char(_sfile: *mut sqlite3_file) -> c_int {
    libsqlite3_sys::SQLITE_OK
}

/// Recover the &mut VFS stashed in sqlite3_vfs.pAppData at registration time.
fn getfs(vfs: *mut sqlite3_vfs) -> &'static mut VFS {
    let fs = unsafe {
        ((*vfs).pAppData as *mut VFS).as_mut().unwrap()
    };
    fs
}

/// Recover the &mut VFSFile embedded (via MojoFile.custom_file) in a sqlite3_file.
fn get_file_mut(sfile: *mut sqlite3_file) -> &'static mut VFSFile {
    let file = unsafe {
        let mojo_file = (sfile as *mut MojoFile).as_mut().unwrap();
        (mojo_file.custom_file as *mut VFSFile).as_mut().unwrap()
    };
    file
}

/// Shared-reference variant of get_file_mut.
fn get_file(sfile: *mut sqlite3_file) -> &'static VFSFile {
    let file = unsafe {
        let mojo_file = (sfile as *mut MojoFile).as_ref().unwrap();
        (mojo_file.custom_file as *mut VFSFile).as_ref().unwrap()
    };
    file
}

/// Convert a sqlite C path into a PathBuf (fails on non-UTF-8).
fn c_to_path(cpath: *const c_char) -> Result<std::path::PathBuf, Error> {
    let file_rs = unsafe { std::ffi::CStr::from_ptr(cpath) };
    let file_str = file_rs.to_str()?;
    Ok(std::path::PathBuf::from(file_str))
}

#[no_mangle]
pub extern "C" fn mojofs_init_log() {
    env_logger::init();
}

/// Parse the URI query parameters SQLite appends after the filename in xOpen:
/// a NUL-separated list of key/value strings terminated by an empty key
/// (see sqlite3_uri_parameter docs). The first string (the filename itself)
/// is skipped by the initial advance-to-NUL pass.
fn extract_query_params(filepath: *const c_char) -> Result<HashMap<String, String>, Error> {
    let mut map = HashMap::new();
    if filepath.is_null() {
        return Ok(map);
    }

    let mut itr: *const c_char = filepath;
    let mut parse_key = true;
    // "dummy" is a non-empty sentinel so the loop runs at least once; the loop
    // exits when an empty key string is read.
    let mut key: &str = "dummy";
    let mut value: &str;

    unsafe {
        while key.len() > 0 {
            // Skip past the current string and its terminating NUL.
            while *itr != 0 {
                itr = itr.add(1);
            }
            itr = itr.add(1);

            let s = CStr::from_ptr(itr).to_str()?;
            if parse_key {
                key = s;
                parse_key = false;
            } else {
                value = s;
                parse_key = true;
                map.insert(key.to_owned(), value.to_owned());
            }
        }
    }

    Ok(map)
}

================================================
FILE: crates/mojofs/src/native_file.rs ================================================ use std::path::{Path, PathBuf}; use nix::fcntl::{self, OFlag}; use nix::sys::stat::Mode; use crate::open_options::*; use crate::Error; pub struct NativeFile { path: PathBuf, fd: i32, opt: OpenOptions, } impl NativeFile { pub fn open(path: &Path, opt: &OpenOptions) -> Result { let open_flags = match opt.access { OpenAccess::Read => { OFlag::O_RDONLY }, OpenAccess::Write => { OFlag::O_RDWR }, OpenAccess::Create => { OFlag::O_CREAT|OFlag::O_RDWR }, OpenAccess::CreateNew => { OFlag::O_CREAT|OFlag::O_RDWR|OFlag::O_EXCL } }; let file_perm = Mode::all(); let fd = fcntl::open(path, open_flags, file_perm)?; Ok(NativeFile{ path: path.to_owned(), opt: opt.clone(), fd, }) } pub fn pread(&self, buf: &mut [u8], off: i64) -> Result { log::debug!("native pread fd={} o={}, blen={}", self.fd, off, buf.len()); let mut i = 0; while i Result<(), Error> { log::debug!("native pwrite fd={} o={}, blen={}", self.fd, off, buf.len()); let mut i=0; while i Result<(), Error> { nix::unistd::close(self.fd)?; if self.opt.delete_on_close { nix::unistd::unlink(&self.path)?; } Ok(()) } pub fn sync(&mut self) -> Result<(), Error> { nix::unistd::fsync(self.fd)?; Ok(()) } pub fn filesize(&self) -> Result { let st = nix::sys::stat::fstat(self.fd)?; Ok(st.st_size as u64) } pub fn truncate(&mut self, new_sz: u64) -> Result<(), Error> { log::debug!("truncate id={} {}", self.fd, new_sz); nix::unistd::ftruncate(self.fd, new_sz as i64)?; Ok(()) } } ================================================ FILE: crates/mojofs/src/open_options.rs ================================================ use libsqlite3_sys as ffi; #[derive(Debug, Clone, PartialEq)] pub struct OpenOptions { pub flags: i32, /// The object type that is being opened. pub kind: OpenKind, /// The access an object is opened with. pub access: OpenAccess, /// The file should be deleted when it is closed. 
pub delete_on_close: bool, } /// The object type that is being opened. #[derive(Debug, Clone, Copy, PartialEq)] pub enum OpenKind { MainDb, MainJournal, TempDb, TempJournal, TransientDb, SubJournal, SuperJournal, Wal, } impl OpenKind { pub fn is_main(&self) -> bool { *self == OpenKind::MainDb } } /// The access an object is opened with. #[derive(Debug, Clone, Copy, PartialEq)] pub enum OpenAccess { /// Read access. Read, /// Write access (includes read access). Write, /// Create the file if it does not exist (includes write and read access). Create, /// Create the file, but throw if it it already exist (includes write and read access). CreateNew, } impl OpenOptions { pub fn from_flags(flags: i32) -> Option { Some(OpenOptions { flags, kind: OpenKind::from_flags(flags)?, access: OpenAccess::from_flags(flags)?, delete_on_close: flags & ffi::SQLITE_OPEN_DELETEONCLOSE > 0, }) } } impl OpenKind { fn from_flags(flags: i32) -> Option { match flags { flags if flags & ffi::SQLITE_OPEN_MAIN_DB > 0 => Some(Self::MainDb), flags if flags & ffi::SQLITE_OPEN_MAIN_JOURNAL > 0 => Some(Self::MainJournal), flags if flags & ffi::SQLITE_OPEN_TEMP_DB > 0 => Some(Self::TempDb), flags if flags & ffi::SQLITE_OPEN_TEMP_JOURNAL > 0 => Some(Self::TempJournal), flags if flags & ffi::SQLITE_OPEN_TRANSIENT_DB > 0 => Some(Self::TransientDb), flags if flags & ffi::SQLITE_OPEN_SUBJOURNAL > 0 => Some(Self::SubJournal), flags if flags & ffi::SQLITE_OPEN_SUPER_JOURNAL > 0 => Some(Self::SuperJournal), flags if flags & ffi::SQLITE_OPEN_WAL > 0 => Some(Self::Wal), _ => None, } } } impl OpenAccess { fn from_flags(flags: i32) -> Option { match flags { flags if (flags & ffi::SQLITE_OPEN_CREATE > 0) && (flags & ffi::SQLITE_OPEN_EXCLUSIVE > 0) => { Some(Self::CreateNew) } flags if flags & ffi::SQLITE_OPEN_CREATE > 0 => Some(Self::Create), flags if flags & ffi::SQLITE_OPEN_READWRITE > 0 => Some(Self::Write), flags if flags & ffi::SQLITE_OPEN_READONLY > 0 => Some(Self::Read), _ => None, } } } 
================================================ FILE: crates/mojofs/src/vfs.rs ================================================ use std::path::{PathBuf, Path}; use crate::{error, Error}; use crate::open_options::*; use std::collections::HashMap; use crate::kvfile::{KVFile, KVFileOpt}; use crate::vfsfile::FileImpl; use mojokv::{Store, BucketOpenMode}; use crate::vfsfile::VFSFile; #[derive(Debug)] pub enum AccessCheck { Exists, ReadWrite, } #[derive(Default)] pub struct VFS { store: Option, file_counter: usize, fopt: FSOptions, } impl VFS { pub fn name(&self) -> String { return "mojo".to_owned() } pub fn fs_options(&self) -> FSOptions { self.fopt.clone() } pub fn active_ver(&self) -> u32 { self.store.as_ref().unwrap().active_ver() } pub fn init(&mut self, root_path: &str, params: &HashMap, opt: OpenOptions) -> Result<(), Error> { log::debug!("init: root_path={} params={:?} opt={:?}", root_path, params, opt); self.fopt = FSOptions::parse(params)?; let root_path = Path::new(root_path); if opt.access == OpenAccess::Read { self.store = Some(Store::readonly(root_path, self.fopt.ver)?); log::debug!("store opened in readonly mode"); }else{ self.store = Some(Store::writable(root_path, true, Some(self.fopt.pagesz), Some(self.fopt.pps))?); log::debug!("store opened writable mode"); } Ok(()) } pub fn open(&mut self, filepath: &str, opt: OpenOptions, _out_opt: &mut OpenOptions) -> Result, Error> { log::debug!("open: file={} opt={:?}", filepath, opt); self.file_counter += 1; let id = self.file_counter; let file_path = if filepath.len() == 0{ std::path::PathBuf::from(format!("mojo.tmp.{}", id)) }else{ std::path::PathBuf::from(filepath) }; let store = self.store.as_mut().unwrap(); let bucket_name = Self::bucket_name(&file_path); let bmode = if let OpenAccess::Read = opt.access { BucketOpenMode::Read }else{ BucketOpenMode::Write }; let b = store.open(bucket_name, bmode)?; let kvfileopt = self.fopt.to_kvfile_opt(); let f = KVFile::open(b, kvfileopt)?; let fimpl = FileImpl::KV(f); 
log::debug!("open: file={} id={} done", filepath, id); Ok(Box::new(VFSFile::new(id, bucket_name, opt, fimpl))) } pub fn fullpath(&mut self, filepath: &str) -> Result { log::debug!("fullpath filepath={}", filepath); let filepath_rs = std::path::Path::new(filepath); if filepath_rs.is_absolute() { Ok(filepath_rs.to_owned()) }else{ let mut cwd = std::env::current_dir()?; cwd.push(filepath_rs); Ok(cwd) } } fn bucket_name(path: &Path) -> &str { path.file_name().unwrap().to_str().unwrap() } //TODO: add sync dir pub fn delete(&mut self, path: &std::path::Path) -> Result<(), Error> { log::debug!("delete path={:?}", path); let name = Self::bucket_name(path); let store = self.store.as_mut().unwrap(); store.delete(name)?; Ok(()) } pub fn access(&self, path: &std::path::Path, req: AccessCheck) -> Result { log::debug!("access path={:?} req={:?}", path, req); let name = Self::bucket_name(path); let store = self.store.as_ref().unwrap(); let status = store.exists(name); log::debug!("access path={:?} status={}", path, status); Ok(status) } pub fn close(&mut self, f: VFSFile) -> Result<(), Error> { let fid = f.id(); log::debug!("close id={}", f.id()); let bucket_name = f.bucket.clone(); let opt = f.opt(); f.close()?; let store = self.store.as_mut().unwrap(); if opt.delete_on_close { log::debug!("close_on_delete is set for id={}", fid); store.delete(bucket_name.as_str())?; } Ok(()) } pub fn commit(&mut self) -> Result<(), Error> { let store = self.store.as_mut().unwrap(); store.commit()?; Ok(()) } } #[derive(Default, Clone)] pub struct FSOptions { pub ver: u32, pub pagesz: u32, pub pps: u32, } impl FSOptions { fn parse(map: &HashMap) -> Result { let mut opt = FSOptions {ver: 0, pagesz: 4096, pps: 0}; opt.ver = match map.get("ver") { Some(s) => s.parse()?, None => 1 }; let s = map.get("pagesz").ok_or(Error::new(error::MOJOFS_ERR_ARG_PAGESZ_MISSING, "fs arg page size missing".to_owned()))?; opt.pagesz = s.parse()?; opt.pps = match map.get("pps") { Some(s) => s.parse()?, None => 65536 }; 
Ok(opt) } fn to_kvfile_opt(&self) -> KVFileOpt { KVFileOpt { ver: self.ver, page_sz: self.pagesz, pps: self.pps, } } } ================================================ FILE: crates/mojofs/src/vfsfile.rs ================================================ use crate::error::Error; use crate::native_file::NativeFile; use crate::kvfile::KVFile; use crate::open_options::OpenOptions; pub enum FileImpl { Reg(NativeFile), KV(KVFile) } pub struct VFSFile { pub bucket: String, id: usize, fimpl: FileImpl, opt: OpenOptions, } impl VFSFile { pub fn new(id: usize, name: &str, opt: OpenOptions, fimpl: FileImpl) -> Self { VFSFile{ bucket: name.to_owned(), id, fimpl, opt, } } pub fn id(&self) -> usize { self.id } pub fn opt(&self) -> OpenOptions { self.opt.clone() } pub fn pread(&self, off: u64, buf: &mut [u8]) -> Result { log::debug!("pread id={} o={}, blen={}", self.id, off, buf.len()); let nread = match &self.fimpl { FileImpl::Reg(f) => { f.pread(buf, off as i64)? }, FileImpl::KV(f) => { f.pread(buf, off as i64)? } }; Ok(nread) } pub fn pwrite(&mut self, off: u64, buf: &[u8]) -> Result<(), Error> { log::debug!("pwrite id={} o={}, blen={}", self.id, off, buf.len()); match &mut self.fimpl { FileImpl::Reg(f) => { f.pwrite(off as i64, buf)?; }, FileImpl::KV(f) => { f.pwrite(off as i64, buf)?; } }; Ok(()) } pub fn close(self) -> Result<(), Error> { log::debug!("file close id={}", self.id); match self.fimpl { FileImpl::Reg(f) => { f.close()?; }, FileImpl::KV(f) => { f.close()?; } }; Ok(()) } pub fn sync(&mut self, flags: i32) -> Result<(), Error> { log::debug!("sync id={} flags={}", self.id, flags); match &mut self.fimpl { FileImpl::Reg(f) => { f.sync()?; }, FileImpl::KV(f) => { f.sync()?; } }; Ok(()) } pub fn filesize(&self) -> Result { let sz = match &self.fimpl { FileImpl::Reg(f) => { f.filesize()? }, FileImpl::KV(f) => { f.filesize()? 
} }; Ok(sz) }

    pub fn truncate(&mut self, new_sz: u64) -> Result<(), Error> {
        log::debug!("truncate id={} {}", self.id, new_sz);
        match &mut self.fimpl {
            FileImpl::Reg(f) => { f.truncate(new_sz)?; },
            FileImpl::KV(f) => { f.truncate(new_sz)?; }
        };
        Ok(())
    }

    // Locking is a no-op for this VFS; the stubs keep the sqlite shim happy.
    pub fn lock(&mut self, _flag: i32) -> Result<(), Error> {
        Ok(())
    }

    pub fn unlock(&mut self, _flag: i32) -> Result<(), Error> {
        Ok(())
    }

    pub fn check_reserved_lock(&self) -> Result<i32, Error> {
        Ok(0)
    }

    pub fn file_control(&mut self, _op: i32) -> Result<(), Error> {
        Ok(())
    }

    pub fn sector_size(&self) -> Result<i32, Error> {
        Ok(0)
    }

    pub fn device_char(&self) -> Result<(), Error> {
        Ok(())
    }
}

================================================
FILE: crates/mojofs/tests/mojofs_test.rs
================================================

use std::path::Path;
use anyhow::Error;
use std::collections::HashMap;
use mojofs::{self, VFS, VFSFile};

/// Remove a previous test tree; NotFound is fine (first run).
fn remove_fs(rootpath: &Path) -> Result<(), Error> {
    if let Err(err) = std::fs::remove_dir_all(rootpath) {
        if err.kind() != std::io::ErrorKind::NotFound {
            return Err(err.into());
        }
    }
    Ok(())
}

fn setup() -> Result<String, Error> {
    let path = Path::new("./testfs");
    remove_fs(path)?;
    Ok(path.to_owned().to_str().unwrap().to_owned())
}

/// Minimal URI params every test needs (ver/pagesz/pps).
fn default_params(pagesz: u32) -> HashMap<String, String> {
    let mut h = HashMap::new();
    h.insert("ver".to_owned(), "1".to_owned());
    h.insert("pagesz".to_owned(), format!("{}", pagesz));
    h.insert("pps".to_owned(), "65536".to_owned());
    h
}

/// Read `nitems` 8-byte pages and assert each equals f(i).
fn read_test(a: &mut VFSFile, nitems: usize, pagesz: u64, f: fn(usize) -> usize) -> Result<(), Error> {
    let mut buf = [0u8; 8];
    for i in 0usize..nitems {
        let off = i as u64 * pagesz;
        let n = f(i);
        a.pread(off, &mut buf)?;
        assert_eq!(n, usize::from_be_bytes(buf));
    }
    Ok(())
}

/// Write `nitems` 8-byte pages with value f(i), then sync.
fn write_test(a: &mut VFSFile, nitems: usize, pagesz: u64, f: fn(usize) -> usize) -> Result<(), Error> {
    for i in 0usize..nitems {
        let off = i as u64 * pagesz;
        let n = f(i);
        a.pwrite(off, &n.to_be_bytes())?;
    }
    a.sync(1)?;
    Ok(())
}

fn write_read(a: &mut VFSFile, nitems: usize, pagesz: u64, f: fn(usize) -> usize) -> Result<(), Error> {
    write_test(a, nitems, pagesz, f)?;
    read_test(a, nitems, pagesz, f)?;
    Ok(())
}

/// End-to-end: write v1, commit, overwrite in v2, commit, then read back each
/// frozen version independently.
#[test]
fn rw_same_version() -> Result<(), Error> {
    env_logger::init();
    let fspath = setup()?;
    let mut fs_uri_opt = default_params(8);
    // 326 = MAIN_DB|CREATE|READWRITE|... as passed by sqlite.
    let mut opt = mojofs::OpenOptions::from_flags(326).unwrap();
    let nitems = 10;

    {
        let mut fs = VFS::default();
        fs.init(&fspath, &fs_uri_opt, opt.clone())?;
        let fsopt = fs.fs_options();
        let mut a = fs.open("a", opt.clone(), &mut opt)?;
        assert_eq!(fsopt.pagesz, 8);

        write_read(&mut a, nitems, fsopt.pagesz as u64, |n| n)?;
        assert_eq!(a.filesize()?, (fsopt.pagesz as u64) * nitems as u64);
        a.close()?;

        fs.commit()?;
        assert_eq!(fs.active_ver(), 2);

        let mut a = fs.open("a", opt.clone(), &mut opt)?;
        write_read(&mut a, nitems, fsopt.pagesz as u64, |n| n + 10)?;
        assert_eq!(a.filesize()?, (fsopt.pagesz as u64) * nitems as u64);
        a.close()?;

        fs.commit()?;
        assert_eq!(fs.active_ver(), 3);
    }

    {
        let mut fs = VFS::default();
        opt.access = mojofs::OpenAccess::Read;
        fs_uri_opt.insert("ver".to_owned(), "1".to_owned());
        fs.init(&fspath, &fs_uri_opt, opt.clone())?;
        let fsopt = fs.fs_options();
        let mut a = fs.open("a", opt.clone(), &mut opt)?;
        read_test(&mut a, nitems, fsopt.pagesz as u64, |n| n)?;
    }

    {
        let mut fs = VFS::default();
        opt.access = mojofs::OpenAccess::Read;
        fs_uri_opt.insert("ver".to_owned(), "2".to_owned());
        fs.init(&fspath, &fs_uri_opt, opt.clone())?;
        let fsopt = fs.fs_options();
        let mut a = fs.open("a", opt.clone(), &mut opt)?;
        read_test(&mut a, nitems, fsopt.pagesz as u64, |n| n + 10)?;
    }

    Ok(())
}

================================================
FILE: crates/mojoio/Cargo.toml
================================================

[package]
name = "mojoio"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
nix = "0.24"
log = "0.4.17"
thiserror = "1.0.31"
parking_lot = {version = "0.12", features=["serde"]}

================================================
FILE: crates/mojoio/src/error.rs
================================================

#[derive(thiserror::Error, Debug)]
pub enum Error {
    #[error("io error")]
    IoErr(#[from] std::io::Error),

    #[error("Unix error")]
    NixErr(#[from] nix::Error),

    #[error("Parse int error")]
    ParseIntErr(#[from] std::num::ParseIntError),

    #[error("Unknown error `{0}`")]
    UnknownStr(String),

    #[error("UTF8 error")]
    UTF8Err(#[from] std::str::Utf8Error),

    #[error("Unknown error")]
    Unknown,
}

================================================
FILE: crates/mojoio/src/lib.rs
================================================

pub mod nix;
mod error;
pub use error::Error;

// Magic prefix written at the start of every page header.
pub const BUFFER_MAGIC: &[u8] = b"mojo";
// On-disk page header length in bytes: 4 magic + 4 block_no.
pub const PAGE_HEADER_LEN: usize = 8;

// cargo-new template leftover, kept so the template test still passes.
pub fn add(left: usize, right: usize) -> usize {
    left + right
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn it_works() {
        let result = add(2, 2);
        assert_eq!(result, 4);
    }
}

================================================
FILE: crates/mojoio/src/nix.rs
================================================

use std::path::Path;
use nix::fcntl::{self, OFlag};
use nix::sys::stat::Mode;
use crate::Error;

/// Append-oriented page file: every page is written with a small header
/// (magic + block number) so data files are self-describing.
pub struct NixFile {
    file_fd: i32,
    // Next append offset; initialised to the file end at open.
    curr_off: u64,
    // Scratch buffer reused for header encoding on every write.
    page_header_buf: [u8; crate::PAGE_HEADER_LEN],
    page_header: PageHeader,
}

impl NixFile {
    /// Open (creating if needed) and seek to the end so appends continue
    /// where the file left off.
    pub fn open(filepath: &Path, _file_no: u32) -> Result<NixFile, Error> {
        let open_flags = OFlag::O_CREAT | OFlag::O_RDWR;
        let file_perm = Mode::all();
        let file_fd = fcntl::open(filepath, open_flags, file_perm)?;
        let curr_off = nix::unistd::lseek(file_fd, 0, nix::unistd::Whence::SeekEnd)? as u64;
        log::debug!("open path={:?} fd={}", filepath, file_fd);
        Ok(NixFile {
            file_fd,
            curr_off,
            page_header_buf: [0; crate::PAGE_HEADER_LEN],
            page_header: PageHeader::new(),
        })
    }

    pub fn close(&mut self) -> Result<(), Error> {
        log::debug!("close fd={}", self.file_fd);
        nix::unistd::close(self.file_fd)?;
        Ok(())
    }

    /// Write header+payload at an absolute offset with one vectored write.
    /// NOTE(review): a short pwritev is reported as an error instead of being
    /// retried — acceptable for regular files, where short writes are rare.
    pub fn write_buf_at(&mut self, off: u64, block_no: u32, buf: &[u8]) -> Result<(), Error> {
        self.page_header.block_no = block_no;
        self.page_header.encode(&mut self.page_header_buf);

        let header_io = std::io::IoSlice::new(&self.page_header_buf);
        let buf_io = std::io::IoSlice::new(buf);
        let io_bufs = [header_io, buf_io];

        log::debug!("file write at fd={} off={} {}", self.file_fd, off, buf.len());
        let n = nix::sys::uio::pwritev(self.file_fd, &io_bufs, off as i64)?;
        if n < header_io.len() + buf_io.len() {
            return Err(Error::UnknownStr("vectored write did not write all data".to_owned()));
        }
        Ok(())
    }

    /// Append a page; returns the file offset it was written at.
    /// NOTE(review): advancing curr_off by `poff` (the intra-page offset) as
    /// well as header+payload looks suspicious but is preserved as-is — the
    /// callers in mojokv depend on this exact layout.
    pub fn write_buf(&mut self, block_no: u32, poff: u64, buf: &[u8]) -> Result<u64, Error> {
        self.write_buf_at(self.curr_off, block_no, buf)?;
        let page_off = self.curr_off;
        self.curr_off += buf.len() as u64 + NixFile::header_len() as u64;
        self.curr_off += poff;
        Ok(page_off)
    }

    pub fn header_len() -> usize {
        return crate::PAGE_HEADER_LEN;
    }

    /// pread loop handling short reads; returns bytes read (< len at EOF).
    fn read_all_at(&self, off: u64, buf: &mut [u8]) -> Result<usize, Error> {
        let mut i = 0;
        while i < buf.len() {
            //TODO: Should n==0 be handled?
            let n = nix::sys::uio::pread(self.file_fd, &mut buf[i..], off as i64 + i as i64)?;
            if n == 0 {
                break;
            }
            i += n;
        }
        Ok(i)
    }

    pub fn read_buf_at(&self, off: u64, buf: &mut [u8]) -> Result<usize, Error> {
        log::debug!("file read at fd={} off={} {}", self.file_fd, off, buf.len());
        let n = self.read_all_at(off, buf)?;
        Ok(n)
    }

    pub fn sync(&self) -> Result<(), Error> {
        log::debug!("sync fd={}", self.file_fd);
        nix::unistd::fsync(self.file_fd)?;
        Ok(())
    }
}

/// Per-page on-disk header: 4-byte magic followed by the little-endian block number.
struct PageHeader {
    magic: &'static [u8],
    block_no: u32,
}

impl PageHeader {
    fn new() -> PageHeader {
        PageHeader {
            magic: crate::BUFFER_MAGIC,
            block_no: 0,
        }
    }

    pub fn encode(&mut self, buf: &mut [u8; crate::PAGE_HEADER_LEN]) {
        buf[..4].copy_from_slice(self.magic);
        buf[4..8].copy_from_slice(&self.block_no.to_le_bytes());
    }

    // (An unused, commented-out decode() draft that no longer matched this
    // 8-byte layout was removed during review.)
}

================================================
FILE: crates/mojokv/.gitignore
================================================

/target
/Cargo.lock
*.db
log*
*.log

================================================
FILE: crates/mojokv/Cargo.toml
================================================

[package]
name = "mojokv"
version = "0.2.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies] mojoio = {path="../mojoio"} modular-bitfield = "0.11" lz4 = "1.23.3" zstd = "0.11.2" thiserror = "1.0.31" log = "0.4.17" env_logger = "0.9.0" rand = "0.8.5" clap = {version="3.2.6", features=["derive"] } anyhow = "1.0.58" parking_lot = {version = "0.12", features=["serde"]} serde = { version = "1.0", features = ["derive", "rc"] } serde_json = "1.0" rmp-serde = "1.1.0" fslock = "0.2.1" rustc-hash = "1.1.0" ================================================ FILE: crates/mojokv/src/bmap.rs ================================================ use std::path::{Path, PathBuf}; use std::collections::HashMap; use std::sync::Arc; use crate::bucket::Bucket; use crate::Error; use parking_lot::RwLock; use serde::{Serialize, Deserialize}; #[derive(Clone, Default, Debug, Serialize, Deserialize)] pub struct BucketMap { map: Arc>>, } impl BucketMap { pub fn add(&self, name: &str, ver: u32) { log::debug!("add name={} ver={} {:?}", name, ver, self.map); let mut map = self.map.write(); //self.buckets.insert(name.to_owned(), b); map.insert(name.to_owned(), ver); } pub fn exists(&self, name: &str) -> bool { let map = self.map.read(); map.contains_key(name) } pub fn get(&self, name: &str) -> Option{ log::debug!("get name={}", name); let map = self.map.read(); map.get(name).map(|v| *v) } pub fn delete(&self, root_path: &Path, name: &str, ver: u32) -> Result<(), Error> { log::debug!("delete name={} {:?}", name, self.map); let mut map = self.map.write(); map.remove(name); Bucket::delete_ver(root_path, name, ver)?; Ok(()) } pub fn map(&self) -> Result, Error> { let map = self.map.read(); Ok(map.clone()) } pub fn serialize_to_path(&self, path: &Path) -> Result<(), Error> { let buf = serde_json::to_vec(&self)?; log::debug!("serializing bmap={:?}", std::str::from_utf8(&buf)); crate::utils::write_file(path, &buf)?; Ok(()) } pub fn deserialize_from_path(path: &Path) -> Result { let mut buf = Vec::new(); crate::utils::load_file(path, &mut buf)?; let map = serde_json::from_slice(&buf)?; 
Ok(map) } fn bmap_path(root_path: &Path, ver: u32) -> PathBuf { root_path.join(&format!("mojo.bmap.{}", ver)) } pub fn load(root_path: &Path, ver: u32) -> Result { let bmap_path = Self::bmap_path(root_path, ver); log::debug!("loading bmap from path={:?}", bmap_path); let bmap = Self::deserialize_from_path(&bmap_path)?; Ok(bmap) } } ================================================ FILE: crates/mojokv/src/bucket.rs ================================================ use std::collections::HashSet; use std::path::{Path, PathBuf}; use crate::{Error, BucketMap}; use mojoio::nix::NixFile; use crate::index::mem::MemIndex; use crate::value::Value; use crate::state::State; pub struct BucketInner { name: String, root_path: PathBuf, index: MemIndex, file_page_sz: usize, fmap: FileMap, is_dirty: bool, is_modified: bool, is_closed: bool, active_ver: u32, } impl BucketInner { fn active_file(&mut self, ver: u32) -> &mut NixFile { self.fmap.file_mut(ver) } fn sync_index(&mut self, ver: u32) -> Result<(), Error> { let non_ref_vers =self.index.update_min_max_ver(); log::debug!("closing versions={:?} as they are no longer referenced", non_ref_vers); self.fmap.close_versions(&non_ref_vers, self.active_ver)?; let index_path = Bucket::index_path(&self.root_path, self.name.as_str(), ver); log::debug!("syncing index ver={} {:?}", ver, index_path); self.index.serialize_to_path(&index_path)?; log::debug!("syncing index ver={} done", ver); Ok(()) } } pub struct Bucket { state: State, //inner: Arc>, inner: BucketInner, bmap: BucketMap, is_write: bool, } impl Bucket { fn with_inner(state: State, inner: BucketInner, bmap: BucketMap) -> Self { Bucket { state, //inner: Arc::new(RwLock::new(inner)), inner, bmap, is_write: false, } } pub fn set_writable(&mut self) { self.is_write = true } pub fn readonly(root_path: &Path, name: &str, ver: u32, state: State, bmap: BucketMap) -> Result { log::debug!("bucket name={} readonly at ver={}", name, ver); let b = Self::load(root_path, name, state, bmap, ver)?; 
Ok(b) } fn index_path(rootpath: &Path, name: &str, ver: u32) -> PathBuf { rootpath.join(&format!("{}_i.{}", name, ver)) } pub fn get_key(&self, key: u32) -> Result, Error> { //let inner = self.inner.read(); Ok(self.inner.index.get(key)?.map(|v| v.clone())) } pub fn max_key(&self) -> isize { //let inner = self.inner.read(); self.inner.index.max_key() } pub fn is_modified(&self) -> bool { //let inner = self.inner.read(); self.inner.is_modified } pub fn writable(root_path: &Path, name: &str, state: State, bmap: BucketMap, load_ver: u32) -> Result { log::debug!("mojo initing bucket pps={}", state.pps()); let aver = state.active_ver(); let index_path = Self::index_path(root_path, name, load_ver); let mut b = if index_path.exists() { log::debug!("bucket index for version={} exists", load_ver); Self::load(root_path, name, state, bmap, aver)? }else{ log::debug!("creating new bucket at ver={}", aver); let mut b = Self::new(root_path, name, state, bmap)?; b.sync()?; b }; b.set_writable(); log::debug!("mojo state={:?}", b.state); Ok(b) } pub fn load(root_path: &Path, name: &str, state: State, bmap: BucketMap, ver: u32) -> Result { log::debug!("loading bucket={} version={}", name, ver); if ver < state.min_ver() || ver > state.active_ver() { return Err(Error::VersionNotFoundErr(ver)); } let (_, _, mut index) = Self::load_index(root_path, name, ver)?; let fmap = FileMap::init(root_path, name, &index.header().vset, state.active_ver())?; index.set_active_ver(state.active_ver()); let file_page_sz = state.page_size() as usize + NixFile::header_len(); let inner = BucketInner { name: name.to_owned(), root_path: root_path.to_owned(), index, file_page_sz, fmap, is_dirty: false, is_modified: false, is_closed: false, active_ver: state.active_ver(), }; log::debug!("mojo load version done"); Ok(Bucket::with_inner(state, inner, bmap)) } pub fn load_index(root_path: &Path, name: &str, ver: u32) -> Result<(usize, usize, MemIndex), Error> { let index_path = Self::index_path(root_path, name, 
ver); log::debug!("loading index={:?} for name={} at ver={}", index_path, name, ver); if !index_path.exists() { return Err(Error::BucketNotAtVerErr(name.to_owned(), ver)); } let index = MemIndex::deserialize_from_path(&index_path)?; Ok(index) } pub fn new(root_path: &Path, name: &str, state: State, bmap: BucketMap) -> Result { log::debug!("creating new bucket name={} at ver={}", name, state.active_ver()); let _ = std::fs::create_dir_all(root_path)?; let index = MemIndex::new(state.pps() as usize); let fmap = FileMap::init(root_path, name, &index.header().vset, state.active_ver())?; let mut inner = BucketInner { name: name.to_owned(), root_path: root_path.to_owned(), index, file_page_sz: state.page_size() as usize + NixFile::header_len(), fmap, is_dirty: false, is_modified: false, is_closed: false, active_ver: state.active_ver(), }; inner.index.set_active_ver(state.active_ver()); let b = Bucket::with_inner(state, inner, bmap); Ok(b) } pub fn logical_size(&self) -> u64 { //let inner = self.inner.read(); (self.state.page_size() as isize * (self.inner.index.max_key() + 1)) as u64 } pub fn close(mut self) -> Result<(), Error> { //let mut inner = self.inner.write(); if self.inner.is_closed { return Ok(()) } self.inner.fmap.close()?; self.inner.is_closed = true; Ok(()) } pub fn truncate(&mut self, new_sz: usize) -> Result<(), Error> { let _ = self.state.commit_lock.read(); //let mut inner = self.inner.write(); log::debug!("truncate bucket={} new_sz={}", self.inner.name, new_sz); let pages = new_sz/(self.state.page_size() as usize); //let real_sz = pages * self.file_page_sz; self.inner.index.truncate(pages as u32)?; self.inner.is_modified = true; //TODO: Delete blocks from file //self.active_file().truncate(real_sz)?; Ok(()) } fn put_at(&mut self, key: u32, page_off: u64, buf: &[u8], val: &Value) -> Result<(), Error> { let mut off = val.get_off() as u64; off = off * self.inner.file_page_sz as u64; off += page_off; let file = self.inner.active_file(self.state.active_ver()); 
file.write_buf_at(off, key, buf)?; Ok(()) } pub fn put(&mut self, key: u32, page_off: u64, buf: &[u8]) -> Result<(), Error> { if !self.is_write { return Err(Error::BucketNotWritableErr); } if self.inner.active_ver < self.state.active_ver() { return Err(Error::VerNotWritable(self.inner.active_ver, self.state.active_ver())); } let _ = self.state.commit_lock.read(); log::debug!("store put aver={} key={}, buflen={}", self.state.active_ver(), key, buf.len()); let val_opt = self.get_value_opt(key)?.map(|v| v.clone()); match val_opt { Some(val) => { //let mut inner = self.inner.write(); log::debug!("store put value exists value={:?}", val); if val.get_ver() == self.state.active_ver() { self.put_at(key, page_off, buf, &val)?; self.inner.index.put(key, val.get_off())?; }else{ let file = self.inner.active_file(self.state.active_ver()); let write_off = file.write_buf(key, page_off, buf)?; let block_no = (write_off/(self.inner.file_page_sz as u64)) as u32; log::debug!("bucket put was done at block_no={} old value={:?}", block_no, val); self.inner.index.put(key, block_no)?; } self.inner.is_dirty = true; self.inner.is_modified = true; }, None => { //let mut inner = self.inner.write(); let file = self.inner.active_file(self.state.active_ver()); let write_off = file.write_buf(key, page_off, buf)?; let block_no = (write_off/(self.inner.file_page_sz as u64)) as u32; self.inner.index.put(key, block_no)?; self.inner.is_dirty = true; self.inner.is_modified = true; log::debug!("store put value not present. 
value={:?}", block_no); } } Ok(()) } pub fn get(&self, key: u32, page_off: u64, out_buf: &mut [u8]) -> Result { //let inner = self.inner.read(); let value = self.get_value(key)?; let mut read_off = (value.get_off() as u64) * (self.inner.file_page_sz as u64); read_off += NixFile::header_len() as u64 + page_off; let read_ver = value.get_ver(); log::debug!("get name={} key={} value: {:?}", self.inner.name, key, value); let file = self.inner.fmap.file(read_ver); let n = file.read_buf_at(read_off, out_buf)?; log::debug!("get name={} key={} n={}", self.inner.name, key, n); Ok(n) } fn get_value_opt(&self, key: u32) -> Result, Error> { //let inner = self.inner.read(); match self.inner.index.get(key)? { None => { log::debug!("get_value_opt no slot key={}", key); return Ok(None) } Some(val) => { if !val.is_allocated() { log::debug!("get_value_opt allocated key={}", key); return Ok(None) }else{ return Ok(Some(val.clone())) } } } } fn get_value(&self, key: u32) -> Result { self.get_value_opt(key)?.ok_or(Error::KeyNotFoundErr(key)) } pub (crate) fn sync_no_commit_lock(&mut self) -> Result<(), Error> { if !self.is_write { return Err(Error::StoreNotWritableErr); } //let mut inner = self.inner.write(); log::debug!("syncing bucket={} at ver={}", self.inner.name, self.state.active_ver()); self.bmap.add(&self.inner.name, self.state.active_ver()); self.inner.active_file(self.state.active_ver()).sync()?; self.inner.sync_index(self.state.active_ver())?; self.inner.is_dirty = false; log::debug!("syncing done"); Ok(()) } pub fn sync(&mut self) -> Result<(), Error> { let _ = self.state.commit_lock.read(); self.sync_no_commit_lock() } pub fn delete_ver(root_path: &Path, name: &str, ver: u32) -> Result<(), Error> { log::debug!("Deleting bucket name={} ver={}", name, ver); let index_path = Self::index_path(root_path, name, ver); log::debug!("removing index file={:?}", index_path); std::fs::remove_file(index_path)?; let data_path = FileMap::data_path(&root_path, name, ver); 
log::debug!("removing data file={:?}", data_path); std::fs::remove_file(data_path)?; Ok(()) } } struct FileMap { fmap: rustc_hash::FxHashMap, } impl FileMap { fn init(root_path: &Path, name: &str, vset: &HashSet, aver: u32) -> Result { //let active_file = Self::open_active_file(root_path, name, active_ver)?; log::debug!("fmap initing for name={} with vset={:?}", name, vset); let mut fmap = FileMap { fmap: rustc_hash::FxHashMap::default(), }; for ver in vset.iter() { if *ver != aver { fmap.add_file(root_path, name, *ver)?; } } fmap.add_file(root_path, name, aver)?; Ok(fmap) } fn close(&mut self) -> Result<(), Error> { for (_v, f) in &mut self.fmap { f.close()?; } Ok(()) } fn close_versions(&mut self, vlist: &Vec, aver: u32) -> Result<(), Error> { for v in vlist { if *v == aver { continue } if let Some(mut f) = self.fmap.remove(v) { f.close()?; } } Ok(()) } fn data_path(root_path: &Path, name: &str, ver: u32) -> PathBuf { root_path.join(format!("{}_d.{}", name, ver)) } fn add_file(&mut self, root_path: &Path, name: &str, ver: u32) -> Result<(), Error> { let ver_path = Self::data_path(root_path, name, ver); log::debug!("adding new file: {:?}", ver_path); let f = NixFile::open(&ver_path, ver)?; self.fmap.insert(ver, f); Ok(()) } fn file_mut(&mut self, ver: u32) -> &mut NixFile { self.fmap.get_mut(&ver).expect(&format!("write ver={} not found", ver)) } fn file(&self, ver: u32) -> &NixFile { &self.fmap.get(&ver).expect(&format!("read ver={} not found", ver)) } } ================================================ FILE: crates/mojokv/src/error.rs ================================================ #[derive(thiserror::Error, Debug)] pub enum Error { #[error("io error")] IoErr(#[from] std::io::Error), #[error("mojo file error")] MojoFileErr(#[from] mojoio::Error), #[error("Bucket {0} not found at ver={1}")] BucketNotAtVerErr(String, u32), #[error("Bucket not writable")] BucketNotWritableErr, #[error("Version no longer writable bucket ver={0} active ver={1}")] VerNotWritable(u32, 
u32), #[error("Store not found")] StoreNotFoundErr, #[error("Store not writable")] StoreNotWritableErr, #[error("Missing arguments")] MissingArgsErr, #[error("Commit lock could not be acquired")] CommitLockedErr, #[error("Only single version exists")] SingleVersionErr, #[error("Json serialization error")] SerdeJsonErr(#[from] serde_json::Error), #[error("rmp encode error")] RmpEncodeErr(#[from] rmp_serde::encode::Error), #[error("rmp decode error")] RmpDecodeErr(#[from] rmp_serde::decode::Error), #[error("Key {0} not found")] KeyNotFoundErr(u32), #[error("Key {0} not multiple of page size")] KeyNotMultipleErr(u32), #[error("Version {0} not found")] VersionNotFoundErr(u32), #[error("Parse int error")] ParseIntErr(#[from] std::num::ParseIntError), #[error("Unknown error `{0}`")] UnknownStr(String), #[error("UTF8 error")] UTF8Err(#[from] std::str::Utf8Error), #[error("Unknown error")] Unknown, } ================================================ FILE: crates/mojokv/src/index/mem.rs ================================================ use std::io::Write; use serde::Deserialize; use serde::Serialize; use crate::value::Value; use crate::keymap::KeyMap; use crate::Error; use crate::utils; use super::IndexHeader; //TODO: Reserve some space for additional data #[derive(Serialize, Deserialize)] pub struct MemIndex { header: IndexHeader, pub kmap: KeyMap } impl MemIndex { pub fn new(pps: usize) -> Self { MemIndex { header: IndexHeader::new(pps), kmap: KeyMap::new(pps), } } pub fn header(&self) -> &IndexHeader { &self.header } fn key_map(&self) -> &KeyMap { &self.kmap } pub fn set_active_ver(&mut self, ver: u32) { self.header.vset.insert(ver); self.header.active_ver = ver; } pub fn active_ver(&self) -> u32 { self.header.active_ver } pub fn max_key(&self) -> isize { self.header.max_key } pub fn update_min_max_ver(&mut self) -> Vec { let (min_ver, max_ver, vset) = self.kmap.get_min_max_ver(); self.header.min_ver = min_ver; self.header.max_ver = max_ver; let non_ref_vers: Vec = 
self.header.vset.difference(&vset).map(|n| *n).collect(); non_ref_vers } pub fn put(&mut self, key: u32, off: u32) -> Result<(), Error> { let mut val = Value::new(); val.put_off(off); val.put_ver(self.header.active_ver); log::debug!("index put val:{:?}", val); self.header.max_key = self.header.max_key.max(key as isize); self.kmap.put(key, val); Ok(()) } pub fn get(&self, key: u32) -> Result, Error> { Ok(self.kmap.get(key)) } pub fn truncate(&mut self, key: u32) -> Result<(), Error> { self.kmap.truncate(key); self.header.max_key = key as isize -1; Ok(()) } pub fn iter<'a>(&'a self, from_key: u32, to_key: u32) -> Box + 'a > { let itr = MemIndexIterator { key: from_key, to_key, index: self }; Box::new(itr) } pub fn serialize_to_path(&self, filepath: &std::path::Path) -> Result<(), Error> { let tmp_buf = rmp_serde::to_vec(&self)?; let cbuf = zstd::bulk::compress(&tmp_buf, 3)?; let mut f = std::fs::OpenOptions::new() .write(true) .create(true) .truncate(true) .open(filepath)?; let cap_buf = tmp_buf.len().to_le_bytes(); f.write_all(&cap_buf)?; f.write_all(&cbuf)?; f.sync_data()?; Ok(()) } pub fn deserialize_from_path(filepath: &std::path::Path) -> Result<(usize, usize, MemIndex), Error> { let mut b = Vec::new(); utils::load_file(filepath, &mut b)?; let cap = usize::from_le_bytes([b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7]]); let buf = zstd::bulk::decompress(&b[8..], cap)?; let index = rmp_serde::from_slice(&buf)?; Ok((cap, b.len(), index)) } } pub struct MemIndexIterator<'a> { index: &'a MemIndex, key: u32, to_key: u32 } impl<'a> MemIndexIterator<'a> { pub fn new(from_key: u32, to_key: u32, index: &'a MemIndex) -> Self { MemIndexIterator { index, key: from_key, to_key, } } } impl<'a> Iterator for MemIndexIterator<'a> { type Item = (u32, &'a Value); fn next(&mut self) -> Option { if self.to_key > 0 && self.key >= self.to_key { return None; } let kmap_index = self.key/self.index.header.pps as u32; if kmap_index as usize >= self.index.key_map().slot_map.len() { return 
None; } let slot_map = &self.index.key_map().slot_map[kmap_index as usize]; let ret = match slot_map { Some(map) => { let slot_index = (self.key as usize) %self.index.header.pps; if slot_index >= map.len() { return None; } let val = &map[slot_index]; if val.is_allocated() { Some((self.key, val)) }else{ None } }, None => None }; self.key += 1; ret } } ================================================ FILE: crates/mojokv/src/index/mod.rs ================================================ pub mod mem; use std::collections::HashSet; use crate::Error; use crate::value::Value; use serde::{Serialize, Deserialize}; pub const MOJO_INDEX_MAGIC: &'static str = "mojo_index"; #[derive(Debug, Serialize, Deserialize)] pub struct IndexHeader { pub magic: String, pub format_ver: u32, pub min_ver: u32, pub max_ver: u32, pub vset: HashSet, pub active_ver: u32, pub max_key: isize, pub pps: usize, } impl IndexHeader { pub fn new(pps: usize) -> Self { let mut vset = HashSet::new(); vset.insert(1); IndexHeader { magic: MOJO_INDEX_MAGIC.to_owned(), format_ver: 1, min_ver: 1, max_ver: 1, vset, active_ver: 1, pps, max_key: -1, } } } pub trait Index { fn put(&mut self, key: u32, off: u32) -> Result<(), Error>; fn get(&self, key: u32) -> Result, Error>; fn truncate(&mut self, key: u32) -> Result<(), Error>; } pub trait IndexSerde { fn serialize(idx: &I, w: &mut W) -> Result<(), Error>; fn deserialize(idx: &I, r: &mut R) -> Result; } ================================================ FILE: crates/mojokv/src/keymap.rs ================================================ use std::collections::HashSet; use crate::value::{Value, Slot}; use serde::{Serialize, Deserialize}; #[derive(Serialize, Deserialize)] pub struct KeyMap { pub slot_map: Vec, pps: usize } impl KeyMap { //TODO: remove this flag pub fn new(pps: usize) -> Self { KeyMap { slot_map: Vec::new(), pps, } } fn alloc_value_arr(pps: usize) -> Vec { let v = vec![Value::new(); pps]; v } pub fn get_min_max_ver(&self) -> (u32, u32, HashSet) { let mut 
set = HashSet::new(); let (mut min_ver, mut max_ver) = (u32::MAX,0); for slot in self.slot_map.iter() { if let Some(slot) = slot { for val in slot.iter() { let v = val.get_ver(); if v == 0 { break; } set.insert(v); min_ver = min_ver.min(v); max_ver = max_ver.max(v); } } } (min_ver, max_ver, set) } pub fn put(&mut self, key: u32, val: Value) { let slot = (key as usize)/self.pps; if slot >= self.slot_map.len() { log::debug!("KeyMap put key={}, value= {:?} slot={} kmaplen={}", key, val, slot, self.slot_map.len()); for _ in 0..(slot - self.slot_map.len() + 1) { self.slot_map.push(None); } } let val_arr = self.slot_map[slot].get_or_insert_with(||{ KeyMap::alloc_value_arr(self.pps) }); let slot_key = key % (self.pps as u32); log::debug!("KeyMap put slot_key={}", slot_key); val_arr[slot_key as usize] = val; } pub fn get(&self, key: u32) -> Option<&Value> { let slot = key/self.pps as u32; log::debug!("KeyMap get key={}, slot={}, kmaplen={}", key, slot, self.slot_map.len()); if slot as usize >= self.slot_map.len() { return None; } self.slot_map[slot as usize].as_ref().map(|val_arr|{ let slot_key = key % self.pps as u32; &val_arr[slot_key as usize] }) } pub fn truncate(&mut self, key: u32) { let slot = key/self.pps as u32; self.slot_map.truncate((slot+1) as usize); let slot_key = key % (self.pps as u32); if let Some(slot_vec) = self.slot_map[slot as usize].as_mut() { for i in slot_key as usize ..slot_vec.len() { slot_vec[i].deallocate(); } } } } ================================================ FILE: crates/mojokv/src/lib.rs ================================================ //#![feature(write_all_vectored)] pub mod index; mod bucket; mod error; mod value; mod state; mod keymap; mod utils; mod store; mod bmap; pub use error::Error; pub use bucket::Bucket; pub use bmap::BucketMap; pub use keymap::KeyMap; pub use value::{Value, Slot}; pub use store::{Store, BucketOpenMode}; //TODO: Pass pps from single place ================================================ FILE: 
crates/mojokv/src/state.rs ================================================ use crate::Error; use mojoio::nix::NixFile; use crate::utils; use std::sync::Arc; use parking_lot::RwLock; use serde::{Serialize, Deserialize}; #[derive(Debug, Serialize, Deserialize)] pub struct StateInner { pub format_ver: u32, pub min_ver: u32, pub max_ver: u32, pub active_ver: u32, pub pps: u32, pub page_sz: u32, pub file_header_len: u32, pub file_page_sz: u32, //TODO: add timestamp } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct State { inner: Arc>, #[serde(skip)] pub commit_lock: Arc>, } impl State { pub fn new(page_sz: u32, pps: u32) -> Self { let inner = StateInner { format_ver: 1, min_ver: 1, max_ver: 1, active_ver: 1, pps, page_sz, file_header_len: NixFile::header_len() as u32, file_page_sz: page_sz + NixFile::header_len() as u32, }; let state = State { inner: Arc::new(RwLock::new(inner)), commit_lock: Arc::new(RwLock::new(false)), }; state } pub fn format_ver(&self) -> u32 { let inner = self.inner.read(); inner.format_ver } pub fn active_ver(&self) -> u32 { let inner = self.inner.read(); inner.active_ver } pub fn page_size(&self) -> u32 { let inner = self.inner.read(); inner.page_sz } pub fn file_page_sz(&self) -> u32 { let inner = self.inner.read(); inner.file_page_sz } pub fn pps(&self) -> u32 { let inner = self.inner.read(); inner.pps } pub fn min_ver(&self) -> u32 { let inner = self.inner.read(); inner.min_ver } pub fn max_ver(&self) -> u32 { let inner = self.inner.read(); inner.max_ver } pub fn advance_ver(&self) -> u32 { let mut inner = self.inner.write(); inner.active_ver += 1; inner.max_ver = inner.max_ver.max(inner.active_ver); inner.active_ver } pub fn serialize_to_path(&self, filepath: &std::path::Path) -> Result<(), Error> { let buf = rmp_serde::to_vec_named(&self)?; utils::write_file(filepath, &buf)?; Ok(()) } pub fn deserialize_from_path(filepath: &std::path::Path) -> Result { let mut buf = Vec::new(); utils::load_file(filepath, &mut buf)?; let state = 
rmp_serde::from_slice(&buf)?; Ok(state) } } ================================================ FILE: crates/mojokv/src/store.rs ================================================ use std::path::{Path, PathBuf}; use std::sync::Arc; use crate::{Error, utils}; use crate::state::State; use crate::bucket::Bucket; use crate::bmap::BucketMap; use crate::index::mem::MemIndex; use parking_lot::RwLock; use fslock::LockFile; struct StoreInner { root_path: PathBuf, state: State, is_write: bool, bmap: BucketMap, } pub struct Store { inner: Arc>, } impl Store { pub fn exists(&self, name: &str) -> bool { let inner = self.inner.read(); inner.bmap.exists(name) } pub fn open(&self, name: &str, mode: BucketOpenMode) -> Result { let mut inner = self.inner.write(); log::debug!("store bucket open name={} mode writable={} store is write: {}", name, mode.is_write(), inner.is_write); if !inner.is_write && mode.is_write() { return Err(Error::StoreNotWritableErr); } let mut b = match inner.bmap.get(name) { Some(v) => { log::debug!("Bucket name={} exists at ver={}", name, v); Bucket::load(&inner.root_path, name, inner.state.clone(), inner.bmap.clone(), v)? }, None => { log::debug!("Bucket name={} does not exists", name); if !inner.is_write { return Err(Error::StoreNotWritableErr); } Bucket::new(&inner.root_path, name, inner.state.clone(), inner.bmap.clone())? 
} }; if inner.is_write && mode.is_write() { log::debug!("setting bucket={} to writable", name); b.set_writable(); b.sync()?; } if mode.is_write() { inner.sync_bmap()?; } Ok(b) } pub fn delete(&self, name: &str) -> Result<(), Error> { let mut inner = self.inner.write(); let aver = inner.state.active_ver(); inner.bmap.delete(&inner.root_path, name, aver)?; inner.sync_bmap() } pub fn commit(&self) -> Result { let mut inner = self.inner.write(); log::debug!("committing store ver={}", inner.state.active_ver()); let _ = inner.state.commit_lock.write(); log::debug!("about to acquire commit file lock ver={}", inner.state.active_ver()); let mut commit_lock_file = Self::create_lock_file(&inner.root_path)?; if !commit_lock_file.try_lock_with_pid()? { return Err(Error::CommitLockedErr); } let new_ver = inner.state.advance_ver(); inner.sync_state()?; inner.sync_bmap()?; log::debug!("committing store done"); Ok(new_ver) } pub fn active_ver(&self) -> u32 { let inner = self.inner.read(); inner.state.active_ver() } pub fn load_state(rootpath: &Path) -> Result { let state_path = rootpath.join("mojo.state"); log::debug!("loading state from {:?}", state_path); let state = State::deserialize_from_path(&state_path)?; Ok(state) } pub fn readonly(root_path: &Path, ver: u32) -> Result { log::debug!("opening store in readonly mode at ver={}", ver); let state = Self::load_state(root_path)?; Self::load_store(root_path, state, ver) } pub fn writable(rootpath: &Path, create: bool, page_sz: Option, pps: Option) -> Result { let init_path = rootpath.join("mojo.init"); if create && (page_sz.is_none() || pps.is_none()) { log::debug!("Missing mandatory params page_sz:{:?} pps:{:?}", page_sz, pps); return Err(Error::MissingArgsErr); } let store = if !init_path.exists() { if !create { return Err(Error::StoreNotFoundErr); } log::debug!("Store does not exists. 
Initing now"); let mut store = Store::new(rootpath, page_sz.unwrap(), pps.unwrap())?; store.init()?; log::debug!("Store init successfull"); store }else{ let state = Self::load_state(rootpath)?; let aver = state.active_ver(); Self::load_store(rootpath, state, aver)? }; { let mut inner = store.inner.write(); inner.is_write = true; } Ok(store) } fn load_store(root_path: &Path, state: State, ver: u32) -> Result { log::debug!("loading store at ver={}", ver); let bmap = BucketMap::load(root_path, ver)?; let inner = StoreInner { root_path: root_path.to_owned(), state, is_write: false, bmap, }; let store = Store {inner: Arc::new(RwLock::new(inner))}; Ok(store) } pub fn get_index(&self, name: &str) -> Result, Error> { let inner = self.inner.read(); match inner.bmap.get(name) { Some(v) => { let ret = Bucket::load_index(&inner.root_path, name, v)?; Ok(Some(ret)) }, None => { log::debug!("Bucket name={} does not exists", name); return Ok(None) } } } fn new(root_path: &Path, page_sz: u32, pps: u32) -> Result { let state = State::new(page_sz, pps); let inner = StoreInner { root_path: root_path.to_owned(), state, is_write: false, bmap: BucketMap::default(), }; let store = Store { inner: Arc::new(RwLock::new(inner)), }; Ok(store) } fn init(&mut self) -> Result<(), Error> { let mut inner = self.inner.write(); std::fs::create_dir_all(&inner.root_path)?; inner.sync()?; let touch_file = inner.root_path.join("mojo.init"); utils::touch_file(&touch_file)?; Ok(()) } fn create_lock_file(root_path: &Path) -> Result { let lock_path = root_path.join("mojo.lock"); log::debug!("creating lock file: {:?}", lock_path); Ok(LockFile::open(&lock_path)?) 
/// Mode a bucket is opened in.
#[derive(Clone, Debug, PartialEq)]
pub enum BucketOpenMode {
    /// Read-only access.
    Read,
    /// Read-write access.
    Write,
}

impl BucketOpenMode {
    /// True when the mode permits writes.
    pub fn is_write(&self) -> bool {
        matches!(self, BucketOpenMode::Write)
    }
}
// One slot of the key map: a lazily-allocated run of `pps` values.
pub type Slot = Option<Vec<Value>>;

/// Packed index entry: a 32-bit physical block offset plus a 24-bit
/// version — 56 bits total, i.e. the 7-byte image the serde impls below
/// read and write.
#[derive(Clone, Copy)]
#[bitfield]
pub struct Value {
    off: B32,
    ver: B24
}

impl std::fmt::Debug for Value {
    // Compact "o=<off>,v=<ver>" form to keep index dumps readable.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
        f.write_fmt(format_args!("o={},v={}", self.off(), self.ver()))
    }
}

impl Value {
    /// A value is "allocated" iff its version is non-zero; version 0 is
    /// the sentinel for an empty/deallocated entry.
    pub fn is_allocated(&self) -> bool {
        self.ver() > 0
    }

    /// Reset to the unallocated sentinel state (off=0, ver=0).
    pub fn deallocate(&mut self) {
        self.set_off(0);
        self.set_ver(0);
    }

    pub fn put_off(&mut self, off: u32) {
        self.set_off(off);
    }

    pub fn get_off(&self) -> u32 {
        self.off()
    }

    pub fn put_ver(&mut self, v: u32) {
        self.set_ver(v);
    }

    pub fn get_ver(&self) -> u32 {
        self.ver() as u32
    }
}

impl Serialize for Value {
    // Serialized as the raw 7-byte bitfield image produced by
    // modular_bitfield's `into_bytes`.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> where S: serde::Serializer {
        serializer.serialize_bytes(&self.into_bytes())
    }
}

struct ValueVisitor {}

impl<'de> Visitor<'de> for ValueVisitor {
    type Value = Value;

    fn visit_bytes<E>(self, v: &[u8]) -> Result<Self::Value, E>
    where
        E: serde::de::Error,
    {
        // Must match the 7-byte image emitted by `Serialize` exactly.
        if v.len() != 7 {
            return Err(serde::de::Error::invalid_length(v.len(), &self));
        }
        Ok(Value::from_bytes([v[0], v[1], v[2], v[3], v[4], v[5], v[6]]))
    }

    fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
        formatter.write_str("byte array of 7 bytes")
    }
}

impl<'de> Deserialize<'de> for Value {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> where D: serde::Deserializer<'de> {
        deserializer.deserialize_bytes(ValueVisitor{})
    }
}
let mut tmp_buf = [0u8; 4]; r.read_exact(&mut tmp_buf)?; let count = u32::from_le_bytes(tmp_buf); if count == 0 { Ok(None) }else{ if count as usize != pps { return Err(Error::UnknownStr("Less number of values than expected".to_owned())); } let mut tmp_vec = Vec::new(); for _ in 0..count { let val = rmp_serde::decode::from_read(r)?; tmp_vec.push(val); } Ok(Some(tmp_vec)) } } */ ================================================ FILE: docs/design.md ================================================ - [Design choices](#design-choices) - [Design](#design) - [Layers](#layers) - [MojoKV](#mojokv) - [Index](#index) - [MojoFS](#mojofs) # Design choices Mojofs is a versioning file-system for sqlite database. It is a completely tailor made for sqlite and is not to be used as a general purpose file-system. This allows the fs to make certain assumptions which in turn drives design choices. Following are the assumptions about any sqlite fs, which I think are reasonable: * Small number of files * Large files * Very flat folder structure * Read/write in terms of fixed page size (or there multiple) dominates compared to any random offset. This influences the way we store namespace (i.e. folder/file names), the index (i.e. mapping between pages and offsets) and internal decoupling (e.g mojofs itself doesn't do much but instead uses mojokv as storage layer) # Design ## Layers The layers of mojofs are: Sqlite -> Mojofs extension lib -> mojofs -> mojokv The sqlite could be the cli (i.e. sqlite3 binary) or sqlite C API or any of its bindings. The fs is developed as both an sqlite extension and vfs, which is compiled down to a shared library. This shared library is loaded as extension which then registers the VFS=mojo with the sqlite. Mojofs implements the sqlite's [VFS interface](https://www.sqlite.org/vfs.html) which asks file system like apis to be implemented. The fs uses mojokv (Mojo Key-Value store) to actually store the data. 
The KV has a notion of 'bucket' which the fs creates for each new file asked by sqlite. All the buckets i.e files taken together are versioned. The MojoKV is tailor made for the needs of sqlite and as such is not a general purpose key-value store. ## MojoKV MojoKV is the core storage layer which handles the index and actual data files. The KV has a notion of bucket, on which read/write happens. Each bucket has an index which is a mapping of (Page No) => (New Page No, version) ### Index The write api at [File IO methods](https://www.sqlite.org/c3ref/io_methods.html) looks like below: ``` int (*xWrite)(sqlite3_file*, const void*, int iAmt, sqlite3_int64 iOfst); ``` The `sqlite3_file*` is the file handle, `const void*, int iAmt` are the pointer to data and its length. The `iOfst` is the offset at which data needs to be written. Mojofs has versions/snapshots and the sqlite API does not know anything about it. This means that when write is called, mojofs needs to know to which version the data should be written. The sqlite divides the file into pages. This is configurable when db is created for the first time, but assume 4KB for this document. We can logically think of a file and its versions as below: | |1|2|3|4| |------|-|-|-|-| |Page 0|1|2|3|4| |Page 1|1|2| |4| |Page 2|1| | | | |Page 3|1| |3|4| |Page 4|1| |3| | The file above has 5 pages of 4KB each and are depicted as rows. The columns are the version numbers. The value for page 0 and version 2 is 2. This means that the page 0 was modified in version 2. For page 2 and version 2 the value is empty and it means the page was not modified from previous version. When the file is created new, naturally all the pages will be marked as version 1. When page 2 and version=3 needs to be read, it actually needs to be fetch the page from version=1. Essentially the page no should map to a certain location on disk. For simple, unversioned file, the page number translates to an offset in file i.e. page no x page size. 
But for versioned file, the mapping of between the tuple (Pg No, Version) => \ is needed. This nicely yields itself to be stored in a key-value store. The actual data-structure used is `Vec>>`. The entire bucket/file is split into slots and the inner/nested vector (`Vec`) is called slot map. Each slot corresponds to a page of sqlite. A slot is a tuple of (physical page number, version). There is a maximum number of slots controlled by a tunable called `pages per slot`. The version 3 for the file in the above example is represented as below (with `pages per slot` = 2) Assume page size=4KB. ``` Slot 0: index 0 = (0,3) index 1 = (1,2) Slot 1: index 0 = (2,1) index 1 = (2,3) Slot 2: index 0 = (2,3) ``` The outer vector consists of 3 vectors, one each corresponding to one slot. The key (which is a page no) is converted to slot vector and index within each slot vector. So the writing and reading from from index is O(1). The value is a tuple (physical page no in a version file, version). Since for each version a separate file is created, the page number in the tuple is the physical page no and version in which the page was last modified/created. ## MojoFS Each sqlite database is created as a directory instead of a single file. For each file name = F and version = V the mojokv stores the file with name F.V. The filename (=F) is chosen by the fs. So for a given sqlite db 'a.db' being created for the first time the fs will create the following on disk: ``` sudeep@local-3 mojo % tree ./a.db ./a.db ├── a.db_d.1 ├── a.db_i.1 ├── mojo.bmap.1 ├── mojo.init └── mojo.state ... ``` So the fs creates the dir = `a.db`. The sqlite issues open call for the main db file i.e. test.db. The fs adds `_d` (d=data) to the name creates `a.db_d.1`. The `.1` is the version. The `a.db_i.1` is the index file, which is internal to the kv. This has the mapping of (page no, version) => Physical offset. When a version 2 is created it will create a file: `a.db_d.2`. 
This file will contain only those pages which were modified in that version. As a result it will also create the index file `a.db_i.2`. The `mojo.*` files are files created/for the mojokv: `mojo.init` => Presence of this file indicates that filesystem was properly initialized. `mojo.state` => It stores the current state of the filesystem, like how many versions, current version, etc `mojo.bmap.1` => Bucket map. This stores namespace of the mojokv. ================================================ FILE: docs/source.md ================================================ Whether you want to just read or contribute to mojo, this document describes the code to get you started. *__Note__*: This is not a design document. ## Bird's eye view * All the rust code is under `crates` folder * All the docs are under `docs` folder * The `sqlite-ext` folder has C code which is compiled down to shared lib * The `test-scripts` has various assorted test scripts which includes perf & black-box test ## Crates ### mojokv This is the KV which powers the mojofs. * `store.rs` has the main store object. Buckets are "opened" using a store object * `bucket.rs` has the bucket object. A bucket has get & put methods. Each bucket has an index. * `index/mem.rs` has the `MemIndex` which is in-memory index which has the mapping `offset -> (physical offset, version)`. Each index has KeyMap. * `keymap.rs` The index is split into slots and a vector such slots are wrapped in KeyMap. * `state.rs` has the state object which reflects the current state of the kv ### mojoio Abstracts out the notion of file. This is the code which does the actual IO. It will have different implementations including remote KV store. * `nix.rs` implements unix based file ### mojofs Mojofs is the filesystem which is powered by mojokv. Each user file in fs maps to a bucket in mojokv. 
* `vfs.rs` has FS like operations like `open`, `delete`, `access`, etc * `kvfile.rs` has file like object which is implemented using mojokv, hence the name. * `native_file.rs` is the regular passthrough file object (uses std read/write) * `vfsfile.rs` has the object VFSFile which either is a kvfile or nativefile. At present everything is kvfile. The native file will be used for transient/temp files which does not need versioning. This is an optimization. * `lib.rs` has vfs functions needed by sqlite e.g. `fn mojo_read(sfile: *mut sqlite3_file, ptr: *mut c_void, n: i32, off: i64)` ### mojo-cli This a CLI utility to control the mojokv. Each command in mojo-cli maps to a `*.rs` file. Example: `commit` command can be found in `commit.rs` ================================================ FILE: docs/user-guide.md ================================================ This document assumes that you have followed the build instructions and/or have the libmojo shared library. - [Opening/Creating the database](#openingcreating-the-database) - [Committing database](#committing-database) - [Committing MojoFS vs Committing Database](#committing-mojofs-vs-committing-database) - [Reading old version](#reading-old-version) ## Opening/Creating the database Mojofs is compiled down to a shared library. It should be loaded as extension in sqlite before it can be used. ```shell sqlite3 < Name of the database * `vfs=mojo` => Name of the MojoFS * `pagesz=4096` => Page size used by the fs. This should same as the page size in the pragma `pragma page_size = 4096` Once set, the page size cannot be changed. When the database is created for the first time, it starts with version=1. Version numbers are ever incrementing and the highest version number is writable and old versions are read-only. The fs has to be commited to make the current active version as read-only. ## Committing database The `mojo-cli` is the tool for administration of the mojofs. 
To commit the database `a.db`: ```shell mojo-cli ./a.db commit ``` Commit advances version number by 1. So if current active version=1 then after committing, a new version=2 will be created and version=1 will be read-only now. Committing FS is really a cheap operation. It only manipulates the metadata of the FS and no data movement is involved. ## Committing MojoFS vs Committing Database Committing the fs is different than committing the database. You can continue to use the database in the active version as long as you want. This means multiple transactions can be initiated with all the DB commits and rollbacks, all in a single version. Committing the database makes sqlite issue `fsync()` call which makes the data written to disk durable. It is recommended to commit the fs when databases is just committed/rolled-backed. The FS is not aware any uncommitted transactions and committing the fs midway can cause undefined behaviour. ## Reading old version Pass the `ver=` and `mode=ro` to open it in readonly mode: ``` .open 'file:a.db?vfs=mojo&pagesz=4096&ver=2&mode=ro' ``` ================================================ FILE: meson.build ================================================ project('mojo', 'c', default_options: ['c_std=c11'], version: '0.1.0') compiler = meson.get_compiler('c') mojokv_lib_path = meson.source_root() + '/target/debug' if get_option('buildtype') == 'release' mojokv_lib_path = meson.source_root() + '/target/release' endif mojofs_lib = compiler.find_library('mojofs', dirs: [mojokv_lib_path]) mojofs_dep = declare_dependency(dependencies: [mojofs_lib]) sqlite_dep = dependency('sqlite3') shared_library('mojo', name_prefix: '', sources: ['sqlite-ext/mojo.c'], include_directories: ['sqlite-ext'], dependencies: [mojofs_dep, sqlite_dep]) ================================================ FILE: sqlite-ext/mojo.c ================================================ #include #include SQLITE_EXTENSION_INIT1 #include #include #include #include #include #include 
#include #include #include #include #include /* ** The maximum pathname length supported by this VFS. */ #define MAXPATHNAME 512 /* ** Argument zPath points to a nul-terminated string containing a file path. ** If zPath is an absolute path, then it is copied as is into the output ** buffer. Otherwise, if it is a relative path, then the equivalent full ** path is written to the output buffer. ** ** This function assumes that paths are UNIX style. Specifically, that: ** ** 1. Path components are separated by a '/'. and ** 2. Full paths begin with a '/' character. */ static int mojo_fullpath_name( sqlite3_vfs *pVfs, /* VFS */ const char *zPath, /* Input path (possibly a relative path) */ int nPathOut, /* Size of output buffer in bytes */ char *zPathOut /* Pointer to output buffer */ ){ char zDir[MAXPATHNAME+1]; if( zPath[0]=='/' ){ zDir[0] = '\0'; }else{ if( getcwd(zDir, sizeof(zDir))==0 ) return SQLITE_IOERR; } zDir[MAXPATHNAME] = '\0'; sqlite3_snprintf(nPathOut, zPathOut, "%s/%s", zDir, zPath); zPathOut[nPathOut-1] = '\0'; return SQLITE_OK; } int sqlite3_mojo_init(sqlite3 *db, char **pzErrMsg, const sqlite3_api_routines *pApi){ int rc = SQLITE_OK; SQLITE_EXTENSION_INIT2(pApi); mojofs_init_log(); sqlite3_vfs *vfs = mojo_create(); vfs->xFullPathname = mojo_fullpath_name; rc = sqlite3_vfs_register(vfs, 0); //printf("extension mojo called with rc=%d\n", rc); fflush(stdout); return rc; } ================================================ FILE: sqlite-ext/mojofs.h ================================================ #ifndef __mojo_h #define __mojo_h #include typedef struct MojoFile { sqlite3_vfs base; void* custom_file; } MojoFile; sqlite3_vfs* mojo_create(); void mojofs_init_log(); #endif ================================================ FILE: test-scripts/commands.py ================================================ """Multiple commands for testing""" import subprocess import sqlite3 MOJOKV_CLI=None class TestConfig: """Config for test db""" def __init__(self, page_sz=4096, 
journal_mode="WAL", vac_mode="NONE"): self.page_sz = page_sz self.journal_mode = journal_mode self.vac_mode = vac_mode def __repr__(self): return f"page_sz={self.page_sz} journal={self.journal_mode}" def vacuum(cur): """Vacuum""" cur.execute("vacuum") def opendb(cfg, db_path, ver="1", mode=""): """Open sqlite db using cfg""" if mode == "": conn_str = f"file:{db_path}?vfs=mojo&ver={ver}&pagesz={cfg.page_sz}&pps=65536" else: conn_str = f"file:{db_path}?vfs=mojo&mode=ro&ver={ver}&pagesz={cfg.page_sz}&pps=65536" conn = sqlite3.Connection(conn_str, uri=True) if mode != "ro": conn.execute(f"PRAGMA page_size={cfg.page_sz}") conn.execute(f"PRAGMA journal_mode={cfg.journal_mode}") conn.execute(f"PRAGMA auto_vacuum={cfg.vac_mode}") return conn def mkdir(dir_path): """"Make dir""" subprocess.run(["mkdir", "-p", dir_path], check=True, capture_output=True) def commit_version(dbpath): '''commit_version commits version for given dbpath''' subprocess.run([MOJOKV_CLI, dbpath, "commit"], check=True, capture_output=True) def create_table_person(cur): """Create table Person""" cur.execute("""create table if not exists person( name text primary key, age integer, id integer )""") cur.execute("create index if not exists person_idx_1 on person(id)") def get_row_count(cur, table): """Get row count of table""" row = cur.execute(f"select count(*) from {table}").fetchone() if row is None or row[0] is None: return 0 return int(row[0]) def table_person_count(conn): """Get row count of Person""" cur = conn.cursor() row = cur.execute("select count(*) from person").fetchone() return int(row[0]) def get_max_id_person(cur): """get max id""" row = cur.execute("select max(id) from person").fetchone() if row is None or row[0] is None: return 0 return int(row[0]) def insert_table_person(cur, count): """Insert into person table""" max_id = get_max_id_person(cur) for n in range(max_id+1, max_id+count+1): name = f"name-{n}" cur.execute("insert into person(name,age,id) values(?,?,?)", [name,n,n]) def 
delete_table_person(cur, from_id, to_id): """Delete from person table""" cur.execute(f"delete from person where id>={from_id} and id <= {to_id}") def drop_table_person(cur): """"Drop table person""" cur.execute("drop table person") def copy_table_person(cur, new_table): """"Copy table from person table""" cur.execute(f"create table if not exists {new_table} as select name,age,id from person") cur.execute(f"create index if not exists {new_table}_idx_1 on {new_table}(id)") ================================================ FILE: test-scripts/perftest.py ================================================ """" Perf test for mojo filesystem """ import sqlite3 import os import shutil import time import sys def rm_dir(path): '''Remove dir. Ignore file not found error''' try: if os.path.isdir(path): rm_fr(path) else: os.unlink(path) except FileNotFoundError: pass def rm_fr(path): '''Equivalent of rm -fr''' journal_path = path + "-journal" wal_path = path + "-wal" rm_dir(journal_path) rm_dir(wal_path) if not os.path.exists(path): return if os.path.isfile(path) or os.path.islink(path): os.unlink(path) else: shutil.rmtree(path) def _mojo_conn_str(db_path, ver="1", mode=""): """open database""" if mode == "": conn_str = f"file:{db_path}?vfs=mojo&ver={ver}&pagesz=4096&pps=65536" else: conn_str = f"file:{db_path}?vfs=mojo&mode=ro&ver={ver}&pagesz=4096&pps=65536" return conn_str def _std_conn_str(db_path, mode=""): if mode == "": conn_str = f"file:{db_path}" else: conn_str = f"file:{db_path}?mode=ro" return conn_str def open_db(db_path, vfs="mojo", ver="1", mode=""): """Open database""" if vfs == "mojo": conn_str=_mojo_conn_str(db_path, ver=ver, mode=mode) else: conn_str=_std_conn_str(db_path, mode=mode) conn = sqlite3.Connection(conn_str, uri=True) if mode != "ro": conn.execute("PRAGMA page_size=4096") return conn def create_table(conn): """create table""" conn.execute("create table test(s text primary key)") def insert_rows(conn, row_count, ver, suffix=""): """create table""" cur = 
conn.cursor() for i in range(row_count): val = f"{ver}-text-{i}{suffix}" cur.execute("insert into test values(?)", (val,)) def count_rows(conn, condition=""): """count rows""" cur = conn.cursor() if condition == "": row = cur.execute("select count(*) from test").fetchone() else: row = cur.execute(f"select count(*) from test where {condition}").fetchone() return int(row[0]) def select_rows(conn, condition=""): """select rows""" cur = conn.cursor() if condition == "": rows = cur.execute("select * from test") else: rows = cur.execute(f"select * from test where {condition}") count = 0 for _r in rows: count += 1 return count def update_text_rows(conn): """Update rows""" key = "odd-update-text" conn.execute("update test set s = ? where s like 'odd%'",(key,)) conn.commit() def load_extension(mojo_lib): """load_ext""" print("using libpath =", mojo_lib) con = sqlite3.connect(":memory:") # enable extension loading con.enable_load_extension(True) con.execute(f"select load_extension('{mojo_lib}')") con.execute("pragma page_size=4096") con.enable_load_extension(False) con.close() ROW_COUNT=10000000 def perf_insert(conn): """"Perf insert""" start = time.time() create_table(conn) insert_rows(conn, ROW_COUNT, "1") conn.commit() end = time.time() return end-start def perf_select(conn): """"Perf select""" start = time.time() count = select_rows(conn) end = time.time() print("select iter count:", count) return end-start def perf_count_rows(conn): """"Perf select""" start = time.time() count = count_rows(conn, "s like '%abc%'") print("row count:", count) end = time.time() return end-start def perf_update_rows(conn): """"Perf update rows""" start = time.time() update_text_rows(conn) end = time.time() return end-start if __name__ == '__main__': if len(sys.argv[1:]) < 1: print("Error: missing extension library path") sys.exit(1) ext_path = sys.argv[1] load_extension(ext_path) STD_DBPATH="./perf-std.db" MOJO_DBPATH="./perf-mojo.db" try: rm_fr(STD_DBPATH) rm_fr(MOJO_DBPATH) perf_list=[ 
("insert", perf_insert), ("update rows", perf_update_rows), ("select", perf_select), ("row count", perf_count_rows), ] for desc, perf_fn in perf_list: print("Running perf for:", desc) e = [] for dbpath, vfs in [(STD_DBPATH, "std"), (MOJO_DBPATH, "mojo")]: dbconn = open_db(dbpath, vfs=vfs) elapsed = perf_fn(dbconn) e.append(elapsed) print(f"\tvfs={vfs} time elapsed (s):", elapsed) dbconn.close() ratio = round(e[1]/e[0], 3) print(f"\tMojo takes {ratio} times than std vfs") print("------------------------") except Exception as e: raise e finally: rm_fr(STD_DBPATH) rm_fr(MOJO_DBPATH) ================================================ FILE: test-scripts/test.sql ================================================ pragma page_size = 4096; .load ./build/libmojo .open 'file:a.db?vfs=mojo&pagesz=4096' create table if not exists test ( n int ); insert into test values (1); insert into test values (2); insert into test values (3); ================================================ FILE: test-scripts/test2.sql ================================================ pragma page_size = 4096; .load ./build/libmojo .open 'file:./testdbs/a_1.db?vfs=mojo&pagesz=4096&mode=ro&ver=2' select count(*), max(id) from person; ================================================ FILE: test-scripts/testdb.py ================================================ """" Tests for mojo filesystem """ import os import sys import unittest import sqlite3 import shutil import commands as c MOJOKV_CLI=None class TestConfig: '''Config for test db''' def __init__(self, page_sz=4096, journal_mode="WAL", vac_mode="NONE", use_tx=True): self.page_sz = page_sz self.journal_mode = journal_mode self.vac_mode = vac_mode self.use_tx = use_tx def __repr__(self): return f"page_sz={self.page_sz} journal={self.journal_mode}" def rm_dir(path): '''Remove dir. 
Ignore file not found error''' try: if os.path.isdir(path): rm_fr(path) else: os.unlink(path) except FileNotFoundError: pass def rm_fr(path): '''Equivalent of rm -fr''' journal_path = path + "-journal" wal_path = path + "-wal" rm_dir(journal_path) rm_dir(wal_path) if not os.path.exists(path): return if os.path.isfile(path) or os.path.islink(path): os.unlink(path) else: shutil.rmtree(path) class MojoWritableTest(unittest.TestCase): '''MojoWritableTest''' def __init__(self, cfg, dbpath, *args, **kargs): self.cfg = cfg self.db_conn = None self.db_path = dbpath super(MojoWritableTest, self).__init__(*args, **kargs) def setUp(self): rm_fr(self.db_path) def tearDown(self): if self.db_conn: self.db_conn.close() def _subtest_name(self, name): return f"{name}: {self.cfg} {self.db_path}" def begin(self, cur): """begin""" if self.cfg.use_tx: cur.execute("begin") def commit(self, cur): """commit""" if self.cfg.use_tx: cur.execute("commit") def rollback(self, cur): """rollback""" if self.cfg.use_tx: cur.execute("rollback") def test_db_use(self): '''Tests the general usage of the database ''' db_conn = c.opendb(self.cfg, self.db_path) ins_row_count = 100 with self.subTest(self._subtest_name("create table")): c.create_table_person(db_conn) with self.subTest(self._subtest_name("insert rows v1")): c.insert_table_person(db_conn, ins_row_count) db_conn.commit() count = c.table_person_count(db_conn) self.assertEqual(ins_row_count, count) db_conn.close() ### Commit ver=1 c.commit_version(self.db_path) db_conn = c.opendb(self.cfg, self.db_path) ### active ver=2 with self.subTest(self._subtest_name("insert rows v2")): self.assertEqual(ins_row_count, c.table_person_count(db_conn)) c.insert_table_person(db_conn, ins_row_count) db_conn.commit() self.assertEqual(ins_row_count*2, c.table_person_count(db_conn)) #db_conn.close() ### Commit ver=2 db_conn.close() c.commit_version(self.db_path) db_conn = c.opendb(self.cfg, self.db_path) ### active ver=3 with self.subTest(self._subtest_name("copy 
table")): self.assertEqual(ins_row_count*2, c.table_person_count(db_conn)) c.copy_table_person(db_conn, "person_2") db_conn.commit() self.assertEqual(ins_row_count*2, c.get_row_count(db_conn, "person_2")) with self.subTest(self._subtest_name("read v1")): db_v1 = c.opendb(self.cfg, self.db_path, mode="ro", ver="1") self.assertEqual(ins_row_count, c.table_person_count(db_v1)) db_v1.close() db_conn.close() ### Commit ver=3 c.commit_version(self.db_path) db_conn = c.opendb(self.cfg, self.db_path) ### active ver=4 with self.subTest(self._subtest_name("delete rows in v3")): c.delete_table_person(db_conn, 0, 10000) db_conn.commit() self.assertEqual(0, c.table_person_count(db_conn)) db_conn.close() ### Commit ver=4 c.commit_version(self.db_path) db_conn = c.opendb(self.cfg, self.db_path) ### active ver=5 with self.subTest(self._subtest_name("vacuum")): c.vacuum(db_conn) db_conn.commit() self.assertEqual(0, c.table_person_count(db_conn)) db_v3 = c.opendb(self.cfg, self.db_path, mode="ro", ver="4") self.assertEqual(0, c.table_person_count(db_v3)) db_v3.close() db_v2 = c.opendb(self.cfg, self.db_path, mode="ro", ver="2") self.assertEqual(ins_row_count*2, c.table_person_count(db_v2)) db_v2.close() db_conn.close() ### Commit ver=5 c.commit_version(self.db_path) def load_extension(mojo_lib): """load_extension""" print("using libpath =", mojo_lib) con = sqlite3.connect(":memory:") # enable extension loading con.enable_load_extension(True) con.execute(f"select load_extension('{mojo_lib}')") con.execute("pragma page_size=4096") con.enable_load_extension(False) con.close() def create_suite(full_mode): ''' Test suite for mojo ''' suite = unittest.TestSuite() if not full_mode: page_sizes = [4096] journal_modes = ["WAL"] vacuum_modes = ["INCREMENTAL"] use_tx = [False] else: page_sizes = [4096] journal_modes = ["OFF", "WAL", "MEMORY", "DELETE", "TRUNCATE", "PERSIST"] vacuum_modes = ["NONE", "FULL", "INCREMENTAL"] use_tx = [False] dbid = 0 for page_sz in page_sizes: for journal_mode in 
journal_modes: for vac_mode in vacuum_modes: for tx in use_tx: cfg = TestConfig(page_sz=page_sz, journal_mode=journal_mode, vac_mode=vac_mode, use_tx=tx) dbid += 1 dbpath = f"./testdbs/a_{dbid}.db" suite.addTest(MojoWritableTest(cfg, dbpath, 'test_db_use')) return suite if __name__ == '__main__': if len(sys.argv[1:]) < 1: print("Error: missing extension library path") sys.exit(1) if len(sys.argv[2:]) >= 1 and sys.argv[2] == "full": FULL = True else: FULL = False MOJOKV_CLI=os.getenv("MOJOKV_CLI") if not MOJOKV_CLI: MOJOKV_CLI="./build/mojo-cli" c.MOJOKV_CLI = MOJOKV_CLI ext_path = sys.argv[1] load_extension(ext_path) rm_fr("./testdbs/*") c.mkdir("./testdbs") runner = unittest.TextTestRunner(failfast=True) runner.run(create_suite(FULL)) #conn = sqlite3.connect("file:a.db?vfs=mojo&ver=1&pagesz=4096", uri=True) #conn = sqlite3.connect("a.db")