Repository: J-F-Liu/pom Branch: master Commit: 0fd011c736ea Files: 30 Total size: 104.0 KB Directory structure: gitextract_7hb7yep3/ ├── .editorconfig ├── .gitignore ├── .travis.yml ├── .vscode/ │ ├── launch.json │ └── tasks.json ├── Cargo.toml ├── LICENSE ├── README.md ├── assets/ │ └── data.json ├── benches/ │ ├── json.rs │ └── json_char.rs ├── doc/ │ └── article.md ├── examples/ │ ├── duration.rs │ ├── json.rs │ ├── json_char.rs │ ├── json_file.rs │ ├── simple.rs │ ├── test.json │ ├── utf8.rs │ ├── utf8_mixed.rs │ └── whitespace.rs ├── rustfmt.toml ├── src/ │ ├── char_class.rs │ ├── lib.rs │ ├── parser.rs │ ├── range.rs │ ├── result.rs │ ├── set.rs │ └── utf8.rs └── tests/ └── list.rs ================================================ FILE CONTENTS ================================================ ================================================ FILE: .editorconfig ================================================ root = true [*] end_of_line = lf charset = utf-8 trim_trailing_whitespace = true insert_final_newline = true indent_style = tab indent_size = 4 [*.md] trim_trailing_whitespace = false ================================================ FILE: .gitignore ================================================ target Cargo.lock ================================================ FILE: .travis.yml ================================================ language: rust sudo: false # run builds for all the trains (and more) rust: - stable # load travis-cargo before_script: - | pip install 'travis-cargo<0.2' --user && export PATH=$HOME/.local/bin:$PATH # the main build script: - | travis-cargo build && travis-cargo test env: global: # override the default `--features unstable` used for the nightly branch (optional) - TRAVIS_CARGO_NIGHTLY_FEATURE="" ================================================ FILE: .vscode/launch.json ================================================ { "version": "0.2.0", "configurations": [ { "type": "lldb", "request": "launch", "name": "Custom launch", 
"program": "./target/debug/examples/json" }, { "type": "lldb-mi", "request": "launch", "name": "Launch Program", "target": "./target/debug/examples/simple", "cwd": "${workspaceRoot}" } ] } ================================================ FILE: .vscode/tasks.json ================================================ { // See https://go.microsoft.com/fwlink/?LinkId=733558 // for the documentation about the tasks.json format "version": "0.1.0", "command": "cargo", "isShellCommand": true, "showOutput": "always", "echoCommand": true, "suppressTaskName": true, "tasks": [ { "taskName": "json", "args": [ "build", "--example", "json" ] }, { "taskName": "release", "args": [ "build", "--release" ] } ] } ================================================ FILE: Cargo.toml ================================================ [package] name = "pom" version = "3.4.1" edition = "2021" authors = ["Junfeng Liu "] homepage = "https://github.com/J-F-Liu/pom" documentation = "https://docs.rs/crate/pom/" repository = "https://github.com/J-F-Liu/pom.git" license = "MIT" readme = "README.md" description = "PEG parser combinators using operator overloading without macros." categories = ["parsing"] keywords = ["parser", "parser-combinators", "parsing", "PEG"] [badges] travis-ci = { repository = "J-F-Liu/pom" } [dependencies] bstr = { version = "1.1.0", features = [ ] } # Only uses one function, so no features needed. 
[features] default = ["utf8"] utf8 = [] trace = [] ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2016 Junfeng Liu Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # pom [![Crates.io](https://img.shields.io/crates/v/pom.svg)](https://crates.io/crates/pom) [![Build Status](https://travis-ci.org/J-F-Liu/pom.png)](https://travis-ci.org/J-F-Liu/pom) [![Docs](https://docs.rs/pom/badge.svg)](https://docs.rs/pom) [![Discord](https://img.shields.io/badge/discord-pom-red.svg)](https://discord.gg/CVy85pg) PEG parser combinators created using operator overloading without macros. ## Document - [Tutorial](https://github.com/J-F-Liu/pom/blob/master/doc/article.md) - [API Reference](https://docs.rs/crate/pom/) - [Learning Parser Combinators With Rust](https://bodil.lol/parser-combinators/) - By Bodil Stokke ## What is PEG? 
PEG stands for parsing expression grammar, is a type of analytic formal grammar, i.e. it describes a formal language in terms of a set of rules for recognizing strings in the language. Unlike CFGs, PEGs cannot be ambiguous; if a string parses, it has exactly one valid parse tree. Each parsing function conceptually takes an input string as its argument, and yields one of the following results: - success, in which the function may optionally move forward or consume one or more characters of the input string supplied to it, or - failure, in which case no input is consumed. Read more on [Wikipedia](https://en.wikipedia.org/wiki/Parsing_expression_grammar). ## What is parser combinator? A parser combinator is a higher-order function that accepts several parsers as input and returns a new parser as its output. Parser combinators enable a recursive descent parsing strategy that facilitates modular piecewise construction and testing. Parsers built using combinators are straightforward to construct, readable, modular, well-structured and easily maintainable. With operator overloading, a parser combinator can take the form of an infix operator, used to glue different parsers to form a complete rule. Parser combinators thereby enable parsers to be defined in an embedded style, in code which is similar in structure to the rules of the formal grammar. And the code is easier to debug than macros. The main advantage is that you don't need to go through any kind of code generation step, you're always using the vanilla language underneath. Aside from build issues (and the usual issues around error messages and debuggability, which in fairness are about as bad with macros as with code generation), it's usually easier to freely intermix grammar expressions and plain code. ## List of predefined parsers and combinators | Basic Parsers | Description | |------------------|-----------------------------------------------------------------| | empty() | Always succeeds, consume no input. 
| | end() | Match end of input. | | any() | Match any symbol and return the symbol. | | sym(t) | Match a single terminal symbol _t_. | | seq(s) | Match sequence of symbols. | | list(p,s) | Match list of _p_, separated by _s_. | | one_of(set) | Success when current input symbol is one of the set. | | none_of(set) | Success when current input symbol is none of the set. | | is_a(predicate) | Success when predicate return true on current input symbol. | | not_a(predicate) | Success when predicate return false on current input symbol. | | take(n) | Read _n_ symbols. | | skip(n) | Skip _n_ symbols. | | call(pf) | Call a parser factory, can be used to create recursive parsers. | | Parser Combinators | Description | |--------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | p | q | Match p or q, return result of the first success. | | p + q | Match p and q, if both succeed return a pair of results. | | p - q | Match p and q, if both succeed return result of p. | | p \* q | Match p and q, if both succeed return result of q. | | p >> q | Parse p and get result P, then parse q and return result of q(P). | | -p | Success when p succeeds, doesn't consume input. | | !p | Success when p fails, doesn't consume input. | | p.opt() | Make parser optional. Returns an `Option`. | | p.repeat(m..n) | `p.repeat(0..)` repeat p zero or more times
`p.repeat(1..)` repeat p one or more times
`p.repeat(1..4)` match p at least 1 and at most 3 times
`p.repeat(5)` repeat p exactly 5 times | | p.map(f) | Convert parser result to desired value. | | p.convert(f) | Convert parser result to desired value, fails in case of conversion error. | | p.pos() | Get input position after matching p. | | p.collect() | Collect all matched input symbols. | | p.discard() | Discard parser output. | | p.name(\_) | Give parser a name to identify parsing errors.
If the `trace` feature is enabled then a basic trace for the parse and parse result is made to stderr. | | p.expect(\_) | Mark parser as expected, abort early when failed in ordered choice. | The choice of operators is established by their operator precedence, arity and "meaning". Use `*` to ignore the result of first operand on the start of an expression, `+` and `-` can fulfill the need on the rest of the expression. For example, `A * B * C - D + E - F` will return the results of C and E as a pair. ## Example code ```rust use pom::parser::*; let input = b"abcde"; let parser = sym(b'a') * none_of(b"AB") - sym(b'c') + seq(b"de"); let output = parser.parse(input); assert_eq!(output, Ok( (b'b', vec![b'd', b'e'].as_slice()) ) ); ``` ### Example JSON parser ```rust extern crate pom; use pom::parser::*; use pom::Parser; use std::collections::HashMap; use std::str::{self, FromStr}; #[derive(Debug, PartialEq)] pub enum JsonValue { Null, Bool(bool), Str(String), Num(f64), Array(Vec), Object(HashMap) } fn space() -> Parser { one_of(b" \t\r\n").repeat(0..).discard() } fn number() -> Parser { let integer = one_of(b"123456789") - one_of(b"0123456789").repeat(0..) | sym(b'0'); let frac = sym(b'.') + one_of(b"0123456789").repeat(1..); let exp = one_of(b"eE") + one_of(b"+-").opt() + one_of(b"0123456789").repeat(1..); let number = sym(b'-').opt() + integer + frac.opt() + exp.opt(); number.collect().convert(str::from_utf8).convert(|s|f64::from_str(&s)) } fn string() -> Parser { let special_char = sym(b'\\') | sym(b'/') | sym(b'"') | sym(b'b').map(|_|b'\x08') | sym(b'f').map(|_|b'\x0C') | sym(b'n').map(|_|b'\n') | sym(b'r').map(|_|b'\r') | sym(b't').map(|_|b'\t'); let escape_sequence = sym(b'\\') * special_char; let string = sym(b'"') * (none_of(b"\\\"") | escape_sequence).repeat(0..) 
- sym(b'"'); string.convert(String::from_utf8) } fn array() -> Parser> { let elems = list(call(value), sym(b',') * space()); sym(b'[') * space() * elems - sym(b']') } fn object() -> Parser> { let member = string() - space() - sym(b':') - space() + call(value); let members = list(member, sym(b',') * space()); let obj = sym(b'{') * space() * members - sym(b'}'); obj.map(|members|members.into_iter().collect::>()) } fn value() -> Parser { ( seq(b"null").map(|_|JsonValue::Null) | seq(b"true").map(|_|JsonValue::Bool(true)) | seq(b"false").map(|_|JsonValue::Bool(false)) | number().map(|num|JsonValue::Num(num)) | string().map(|text|JsonValue::Str(text)) | array().map(|arr|JsonValue::Array(arr)) | object().map(|obj|JsonValue::Object(obj)) ) - space() } pub fn json() -> Parser { space() * value() - end() } fn main() { let input = br#" { "Image": { "Width": 800, "Height": 600, "Title": "View from 15th Floor", "Thumbnail": { "Url": "http://www.example.com/image/481989943", "Height": 125, "Width": 100 }, "Animated" : false, "IDs": [116, 943, 234, 38793] } }"#; println!("{:?}", json().parse(input)); } ``` You can run this example with the following command: ``` cargo run --example json ``` ## Benchmark | Parser | Time to parse the same JSON file | |------------------------------------------------------|----------------------------------| | pom: json_byte | 621,319 ns/iter (+/- 20,318) | | pom: json_char | 627,110 ns/iter (+/- 11,463) | | [pest](https://github.com/dragostis/pest): json_char | 13,359 ns/iter (+/- 811) | ### Lifetimes and files String literals have a static lifetime so they can work with the static version of Parser imported from `pom::Parser`. Input read from a file has a shorter lifetime. In this case you should import `pom::parser::Parser` and declare lifetimes on your parser functions. 
So ```rust fn space() -> Parser { one_of(b" \t\r\n").repeat(0..).discard() } ``` would become ```rust fn space<'a>() -> Parser<'a, u8, ()> { one_of(b" \t\r\n").repeat(0..).discard() } ``` ================================================ FILE: assets/data.json ================================================ [ { "_id": "5741cfe6bf9f447a509a269e", "index": 0, "guid": "642f0c2a-3d87-43ac-8f82-25f004e0c96a", "isActive": false, "balance": "$3,666.68", "picture": "http://placehold.it/32x32", "age": 39, "eyeColor": "blue", "name": "Leonor Herman", "gender": "female", "company": "RODEOMAD", "email": "leonorherman@rodeomad.com", "phone": "+1 (848) 456-2962", "address": "450 Seeley Street, Iberia, North Dakota, 7859", "about": "Reprehenderit in anim laboris labore sint occaecat labore proident ipsum exercitation. Ut ea aliqua duis occaecat consectetur aliqua anim id. Dolor ea fugiat excepteur reprehenderit eiusmod enim non sit nisi. Mollit consequat anim mollit et excepteur qui laborum qui eiusmod. Qui ea amet incididunt cillum quis occaecat excepteur qui duis nisi. Dolore labore eu sunt consequat magna.\r\n", "registered": "2015-03-06T02:49:06 -02:00", "latitude": -29.402032, "longitude": 151.088135, "tags": [ "Lorem", "voluptate", "aute", "ullamco", "elit", "esse", "culpa" ], "friends": [ { "id": 0, "name": "Millicent Norman" }, { "id": 1, "name": "Vincent Cannon" }, { "id": 2, "name": "Gray Berry" } ], "greeting": "Hello, Leonor Herman! You have 4 unread messages.", "favoriteFruit": "apple" }, { "_id": "5741cfe69424f42d4493caa2", "index": 1, "guid": "40ec6b43-e6e6-44e1-92a8-dc80cd5d7179", "isActive": true, "balance": "$2,923.78", "picture": "http://placehold.it/32x32", "age": 36, "eyeColor": "blue", "name": "Barton Barnes", "gender": "male", "company": "BRAINQUIL", "email": "bartonbarnes@brainquil.com", "phone": "+1 (907) 553-3739", "address": "644 Falmouth Street, Sedley, Michigan, 5602", "about": "Et nulla laboris consectetur laborum labore. 
Officia dolor sint do amet excepteur dolore eiusmod. Occaecat pariatur sunt velit sunt ullamco labore commodo mollit sint dolore occaecat.\r\n", "registered": "2014-08-28T01:07:22 -03:00", "latitude": 14.056553, "longitude": -61.911624, "tags": [ "laboris", "sunt", "esse", "tempor", "pariatur", "occaecat", "et" ], "friends": [ { "id": 0, "name": "Tillman Mckay" }, { "id": 1, "name": "Rivera Berg" }, { "id": 2, "name": "Rosetta Erickson" } ], "greeting": "Hello, Barton Barnes! You have 2 unread messages.", "favoriteFruit": "banana" } ] ================================================ FILE: benches/json.rs ================================================ #![feature(test)] extern crate test; use self::test::Bencher; use std::fs::File; use std::io::Read; extern crate pom; #[path = "../examples/json.rs"] mod json; #[bench] fn json_byte(b: &mut Bencher) { let mut file = File::open("assets/data.json").unwrap(); let mut input = Vec::new(); file.read_to_end(&mut input).unwrap(); b.iter(|| { json::json().parse(&input).ok(); }); } ================================================ FILE: benches/json_char.rs ================================================ #![feature(test)] extern crate test; use self::test::Bencher; use std::fs::File; use std::io::Read; extern crate pom; #[path = "../examples/json_char.rs"] mod json; #[bench] fn json_char(b: &mut Bencher) { let mut file = File::open("assets/data.json").unwrap(); let mut input = String::new(); file.read_to_string(&mut input).unwrap(); let chars: Vec = input.chars().collect(); b.iter(|| { json::json().parse(&chars).ok(); }); } ================================================ FILE: doc/article.md ================================================ # PEG Parser Combinators Implemented in Rust This article introduces [pom](https://github.com/J-F-Liu/pom), a PEG parser combinator library implemented in Rust, using operator overloading without macros. ## Why Rust? 
![Rust](rust.png) After I've learned C/C++ and C#, I found that choosing a new programming language can greatly affect a programmer's productivity. On one hand I keep sorting out new languages, there are hundreds of them, I examine and choose what I like best, my favorites are C#, Ruby, TypeScript and Rust. On the other hand I try to design a new language and implement a compiler by myself. I like the syntax provided by C#, but hate the huge .NET runtime. Dependency on CLR makes distribution of an application written in C# very hard. Compiling to native code is always what I longed for in a programming language. In 2003 I thought a compiler could get rid of the garbage collector by generating free-memory instructions in appropriate locations in the target program. But I didn't go deep into the design of the details of this mechanism, I decided to first write a working compiler, then improve the design of the language and implementation of the compiler bit by bit. The first stage of compilation is parsing. I tried some parser generators, but was not satisfied with the result. Then I dug into parsing theory, followed several books, implemented DFA, NFA, NFA-to-DFA conversion, LL(1), LR, and LALR algorithms, then wrote a parser to parse a BNF, EBNF or TBNF grammar file, and generate parser code corresponding to the grammar. The syntax/semantics analysis and code generation parts of a compiler are more difficult. I even tried to define an intermediate assembly language; at that time I didn't know LLVM. My effort of writing a compiler ceased for years, then Rust was born. At first glance, Rust's syntax is a bit strange — why use `fn` instead of `def`, why use `let mut` instead of `var` — and I was not attracted by it. 
After reading a publication on O'Reilly [*Why Rust?*](http://www.oreilly.com/programming/free/files/why-rust.pdf) I suddenly realized that this is the language I'm trying to build; when you actually start using Rust you'll find that `fn` and `let mut` fit Rust's logic well. For me, **Rust is once a dream, now a reality.** Rust has a steep learning curve, more challenging than any of the previous programming languages I learned. All this learning is worthwhile when you finally get your program working and polished. An object-oriented class hierarchy is not good enough for code reuse; Rust's enum, tuple, struct and trait type system is a better solution. I am still wondering whether the Rust compiler can be smart enough to elide all the lifetime parameters; they are mostly noise and an obstacle when reading and writing programs. ## What is PEG? When I discovered [PEG](http://bford.info/packrat/), I knew that all my previous work on LALR could be thrown away. I rewrote my parser generator using and working with PEG. Using this parser generator I created a [YAML parser](https://www.codeproject.com/Articles/28720/YAML-Parser-in-C) and a [Lua Interpreter](https://www.codeproject.com/Articles/228212/Lua-Interpreter). [Parsing Expression Grammars](http://en.wikipedia.org/wiki/Parsing_expression_grammar) (PEGs) are an alternative to [Context-Free Grammars](http://en.wikipedia.org/wiki/Context-free_grammar) (CFGs) for formally specifying syntax. CFGs describe a rule system to generate language strings while PEGs describe a rule system to recognize language strings. ![CFG](cfg.png) ![PEG](peg.png) Unlike CFGs, PEGs cannot be ambiguous; if a string parses, it has exactly one valid parse tree. We normally specify our languages directly by how to recognize them, so PEG is both a closer match to syntax practices and more powerful than nondeterministic CFG. 
### Parsing expressions | Expr | Description | | ---------- | ----------------------------------- | | ε | the empty string | | a | terminal (a ∈ Σ) | | A | non-terminal (A ∈ N) | | e1 e2 | sequence | | e1 / e2 | prioritized choice | | e?, e*, e+ | optional, zero-or-more, one-or-more | | &e, !e | syntactic predicates | ## What is parser combinator? When I heard of Parsec in the Haskell world, I got the concept of parser combinator for my first time. A *parser* is a function which takes a *string* (a series of *symbols*) as input, and returns matching result as *output*. A *combinator* is a higher-order function (a "functional") which takes zero or more functions (each of the same type) as input and returns a new function of the same type as output. A *parser combinator* is a higher-order function which takes parsers as input and returns a new parser as output. Parser combinators allow you write grammar rules and create a parser directly in the host language, without a separated parser generation step, so the whole procedure is more fluent. ## How to implement parser combinators? I thought deeply about how to implement parser combinator using language constructs provided by Rust. In summary, there are four approaches: 1. Parser as closure ```rust pub fn empty() -> impl Fn(&mut Input) -> Result<()> { |_: &mut Input| Ok(()) } pub fn term(t: I) -> impl Fn(&mut Input) -> Result { ... } pub fn seq<'a, I>(tag: &'a [I]) -> impl Fn(&mut Input) -> Result<&'a [I]> { ... } ... // To create a parser for integer let parser = concatenate(optional(one_of("+-")), one_or_more(one_of("0123456789"))); ``` *Pros*: Less implementation code. *Cons*: Cannot overload operators, poor readability. 2. Parser as struct ```rust pub struct Parser { method: Box) -> Result>, } impl Parser { /// Create new parser. pub fn new

(parse: P) -> Parser where P: Fn(&mut Input) -> Result + 'static { Parser { method: Box::new(parse) } } /// Apply the parser to parse input. pub fn parse(&self, input: &mut Input) -> Result { (self.method)(input) } ... } pub fn empty() -> Parser { Parser::new(|_: &mut Input| Ok(())) } pub fn term(t: I) -> Parser { ... } ... impl Add> for Parser { type Output = Parser; fn add(self, other: Parser) -> Self::Output where I: 'static, O: 'static, U: 'static { Parser::new(move |input: &mut Input| { let start = input.position(); let result = self.parse(input) .and_then(|out1| other.parse(input).map(|out2| (out1, out2))); if result.is_err() { input.jump_to(start); } result }) } } ... // To create a parser for integer let parser = one_of("+-").opt() + one_of("0123456789").repeat(1..); ``` *Pros*: Can overload operators, elegant code. *Cons*: Depends on compiler's zero-cost abstractions to optimize runtime performance. Crate [pom](https://github.com/J-F-Liu/pom) is using this approach. 3. Parser as trait ```rust pub trait Parser { type I: ?Sized; type O; fn parse<'a>(&self, data: &'a Self::I) -> ParseResult<&'a Self::I, Self::O>; } pub trait ParserCombinator : Parser + Clone { fn then>(&self, p: P) -> ChainedParser { ChainedParser{first: self.clone(), second: p} } ... } pub fn opt(t: T) -> OptionParser { OptionParser{parser: t} } pub fn recursive Box>>(f: F) -> RecursiveParser { RecursiveParser{parser: Rc::new(f)} } ... pub struct ChainedParser { first: A, second: B, } ... impl, B: Parser> Parser for ChainedParser { type I = C; type O = (A::O,B::O); fn parse<'a>(&self, data: &'a Self::I) -> ParseResult<&'a Self::I, Self::O>{ match self.first.parse(data) { Ok((a, d2)) => match self.second.parse(d2) { Ok((b, remain)) => Ok(((a, b), remain)), Err(err) => Err(err) }, Err(err) => Err(err) } } } impl, B: ParserCombinator> Clone for ChainedParser { ... } ... ``` *Pros*: Can overload operators. *Cons*: Bloated code. 
Crate [peruse](https://github.com/DanSimon/peruse) is using this approach. 4. Parser as macro ```rust #[macro_export] macro_rules! do_parse ( (__impl $i:expr, $consumed:expr, ( $($rest:expr),* )) => ( $crate::IResult::Done($i, ( $($rest),* )) ); (__impl $i:expr, $consumed:expr, $e:ident >> $($rest:tt)*) => ( do_parse!(__impl $i, $consumed, call!($e) >> $($rest)*); ); (__impl $i:expr, $consumed:expr, $submac:ident!( $($args:tt)* ) >> $($rest:tt)*) => ( { match $submac!($i, $($args)*) { $crate::IResult::Error(e) => $crate::IResult::Error(e), $crate::IResult::Incomplete($crate::Needed::Unknown) => $crate::IResult::Incomplete($crate::Needed::Unknown), $crate::IResult::Incomplete($crate::Needed::Size(i)) => $crate::IResult::Incomplete($crate::Needed::Size($consumed + i)), $crate::IResult::Done(i,_) => { do_parse!(__impl i, $consumed + ($crate::InputLength::input_len(&($i)) - $crate::InputLength::input_len(&i)), $($rest)*) }, } } ); (__impl $i:expr, $consumed:expr, $field:ident : $e:ident >> $($rest:tt)*) => ( do_parse!(__impl $i, $consumed, $field: call!($e) >> $($rest)*); ); (__impl $i:expr, $consumed:expr, $field:ident : $submac:ident!( $($args:tt)* ) >> $($rest:tt)*) => ( { match $submac!($i, $($args)*) { $crate::IResult::Error(e) => $crate::IResult::Error(e), $crate::IResult::Incomplete($crate::Needed::Unknown) => $crate::IResult::Incomplete($crate::Needed::Unknown), $crate::IResult::Incomplete($crate::Needed::Size(i)) => $crate::IResult::Incomplete($crate::Needed::Size($consumed + i)), $crate::IResult::Done(i,o) => { let $field = o; do_parse!(__impl i, $consumed + ($crate::InputLength::input_len(&($i)) - $crate::InputLength::input_len(&i)), $($rest)*) }, } } ); // ending the chain (__impl $i:expr, $consumed:expr, $e:ident >> ( $($rest:tt)* )) => ( do_parse!(__impl $i, $consumed, call!($e) >> ( $($rest)* )); ); (__impl $i:expr, $consumed:expr, $submac:ident!( $($args:tt)* ) >> ( $($rest:tt)* )) => ( match $submac!($i, $($args)*) { $crate::IResult::Error(e) => 
$crate::IResult::Error(e), $crate::IResult::Incomplete($crate::Needed::Unknown) => $crate::IResult::Incomplete($crate::Needed::Unknown), $crate::IResult::Incomplete($crate::Needed::Size(i)) => $crate::IResult::Incomplete($crate::Needed::Size($consumed + i)), $crate::IResult::Done(i,_) => { $crate::IResult::Done(i, ( $($rest)* )) }, } ); (__impl $i:expr, $consumed:expr, $field:ident : $e:ident >> ( $($rest:tt)* )) => ( do_parse!(__impl $i, $consumed, $field: call!($e) >> ( $($rest)* ) ); ); (__impl $i:expr, $consumed:expr, $field:ident : $submac:ident!( $($args:tt)* ) >> ( $($rest:tt)* )) => ( match $submac!($i, $($args)*) { $crate::IResult::Error(e) => $crate::IResult::Error(e), $crate::IResult::Incomplete($crate::Needed::Unknown) => $crate::IResult::Incomplete($crate::Needed::Unknown), $crate::IResult::Incomplete($crate::Needed::Size(i)) => $crate::IResult::Incomplete($crate::Needed::Size($consumed + i)), $crate::IResult::Done(i,o) => { let $field = o; $crate::IResult::Done(i, ( $($rest)* )) }, } ); ($i:expr, $($rest:tt)*) => ( { do_parse!(__impl $i, 0usize, $($rest)*) } ); ); ... // To create a parser for integer named!(integer<&[u8], i64>, map!( pair!( opt!(sign), map_res!(map_res!(digit, str::from_utf8), i64::from_str) ), |(sign, value): (Option, i64)| { sign.unwrap_or(1) * value } )); ``` *Pros*: Can create DSL syntax, high performance. *Cons*: Macros themselves are difficult to read, write and debug. According to above comparison, parser as struct is the best approach. At first I choose to use nom to create a PDF parser, it turns out a special PDF feature blocked me. When parsing a PDF stream object, it's length may be a referenced object, hence the need to get the length from a reader. The `named! ` macro cannot accept extra parameters, there is no obvious way to read a length object inside a stream object parser. This is the primary reason why I started to develop pom. 
## List of predefined parsers and combinators in pom | Basic Parsers | Description | | -------------- | ---------------------------------------- | | empty() | Always succeeds, consume no input. | | end() | Match end of input. | | sym(t) | Match a single terminal symbol *t*. | | seq(s) | Match sequence of symbols. | | list(p,s) | Match list of *p*, separated by *s*. | | one_of(set) | Success when current input symbol is one of the set. | | none_of(set) | Success when current input symbol is none of the set. | | is_a(predicate) | Success when predicate return true on current input symbol. | | not_a(predicate) | Success when predicate return false on current input symbol. | | take(n) | Read *n* symbols. | | skip(n) | Skip *n* symbols. | | call(pf) | Call a parser factory, can used to create recursive parsers. | These are functions to create basic parsers. | Parser Combinators | Description | | ------------------ | ---------------------------------------- | | p | q | Match p or q, return result of the first success. | | p + q | Match p and q, if both success return a pair of results. | | p - q | Match p and q, if both success return result of p. | | p * q | Match p and q, if both success return result of q. | | p >> q | Parse p and get result P, then parse and return result of q(P). | | -p | Success when p success, doen't consume input. | | !p | Success when p fail, doen't consume input. | | p.opt() | Make parser optional. | | p.repeat(m..n) | `p.repeat(0..)` repeat p zero or more times
`p.repeat(1..)` repeat p one or more times
`p.repeat(1..4)` match p at least 1 and at most 3 times
`p.repeat(1..=3)` also match p at least 1 and at most 3 times | | p.map(f) | Convert parser result to desired value. | | p.convert(f) | Convert parser result to desired value, fail in case of conversion error. | | p.pos() | Get input position after matching p. | | p.collect() | Collect all matched input symbols. | | p.discard() | Discard parser output. | | p.name(_) | Give parser a name to identify parsing errors. | These are operations to create new parsers based on other parsers. The choice of operators is established by their operator precedence, arity and "meaning". Use `*` to ignore the result of first operand on the start of an expression, `+` and `-` can fulfill the need on the rest of the expression. For example, `A * B * C - D + E - F` will return the results of C and E as a pair. ## Using the code There are three ways to create a parser: 1. As a variable, normally used to construct another parser. ```rust let integer = one_of(b"+-").opt() + one_of(b"0123456789").repeat(1..); ``` 2. As a closure, when referenced several times in constructing another parser. ```rust let integer = || one_of(b"+-").opt() + one_of(b"0123456789").repeat(1..); let pair = sym(b'(') * integer() - sym(b',') + integer() - sym(b')'); ``` 3. As a function, provides a high level construct. ```rust fn integer() -> Parser, Vec)> { one_of(b"+-").opt() + one_of(b"0123456789").repeat(1..) } ``` ## Example JSON Parser Let me explain the parser combinators in more detail by creating a JSON parser. Syntax diagrams can be found on [json.org](http://www.json.org/). 
```rust extern crate pom; use pom::{Parser, DataInput}; use pom::char_class::hex_digit; use pom::parser::*; use std::str::FromStr; use std::char::{decode_utf16, REPLACEMENT_CHARACTER}; use std::collections::HashMap; #[derive(Debug, PartialEq)] pub enum JsonValue { Null, Bool(bool), Str(String), Num(f64), Array(Vec), Object(HashMap) } ``` Import predefined parser combinators and utility functions, define the JSON parser's output value as an enum. ```rust fn space() -> Parser { one_of(b" \t\r\n").repeat(0..).discard() } ``` Match zero or more space characters, the output is ignored. ```rust fn number() -> Parser { let integer = one_of(b"123456789") - one_of(b"0123456789").repeat(0..) | sym(b'0'); let frac = sym(b'.') + one_of(b"0123456789").repeat(1..); let exp = one_of(b"eE") + one_of(b"+-").opt() + one_of(b"0123456789").repeat(1..); let number = sym(b'-').opt() + integer + frac.opt() + exp.opt(); number.collect().convert(String::from_utf8).convert(|s|f64::from_str(&s)) } ``` Don't care each output of integer, frac or exp, collect() method get all the match character as a Vec, then it is converted to a string, and further converted to a float number. ```rust fn string() -> Parser { let special_char = sym(b'\\') | sym(b'/') | sym(b'"') | sym(b'b').map(|_|b'\x08') | sym(b'f').map(|_|b'\x0C') | sym(b'n').map(|_|b'\n') | sym(b'r').map(|_|b'\r') | sym(b't').map(|_|b'\t'); let escape_sequence = sym(b'\\') * special_char; let char_string = (none_of(b"\\\"") | escape_sequence).repeat(1..).convert(String::from_utf8); let utf16_char = seq(b"\\u") * is_a(hex_digit).repeat(4).convert(String::from_utf8).convert(|digits|u16::from_str_radix(&digits, 16)); let utf16_string = utf16_char.repeat(1..).map(|chars|decode_utf16(chars).map(|r| r.unwrap_or(REPLACEMENT_CHARACTER)).collect::()); let string = sym(b'"') * (char_string | utf16_string).repeat(0..) - sym(b'"'); string.map(|strings|strings.concat()) } ``` The bulk of code is written to parse escape sequences. 
According to [Wikipedia](https://en.wikipedia.org/wiki/JSON#Data_portability_issues), UTF-16 surrogate pairs is a detail missed by some JSON parsers. We implement this easily with Rust's Unicode support. ```rust fn array() -> Parser> { let elems = list(call(value), sym(b',') * space()); sym(b'[') * space() * elems - sym(b']') } fn object() -> Parser> { let member = string() - space() - sym(b':') - space() + call(value); let members = list(member, sym(b',') * space()); let obj = sym(b'{') * space() * members - sym(b'}'); obj.map(|members|members.into_iter().collect::>()) } fn value() -> Parser { ( seq(b"null").map(|_|JsonValue::Null) | seq(b"true").map(|_|JsonValue::Bool(true)) | seq(b"false").map(|_|JsonValue::Bool(false)) | number().map(|num|JsonValue::Num(num)) | string().map(|text|JsonValue::Str(text)) | array().map(|arr|JsonValue::Array(arr)) | object().map(|obj|JsonValue::Object(obj)) ) - space() } ``` array and object are very straight to parse, notice `call(value)`, at the first attempt I write it as `value()`, then an infinite loop is created. Recursive parsing is solved by adding `call()` to `pom`. ```rust pub fn json() -> Parser { space() * value() - end() } ``` The final JSON parser, declared as public. According to [RFC 7159](https://tools.ietf.org/html/rfc7159) a JSON text is a serialized value of any of the six types. `end()` is used to ensure there is no extra text in the input. 
```rust fn main() { let test = br#" { "Image": { "Width": 800, "Height": 600, "Title": "View from 15th Floor", "Thumbnail": { "Url": "http://www.example.com/image/481989943", "Height": 125, "Width": 100 }, "Animated" : false, "IDs": [116, 943, 234, 38793] }, "escaped characters": "\u2192\uD83D\uDE00\"\t\uD834\uDD1E" }"#; let mut input = DataInput::new(test); println!("{:?}", json().parse(&mut input)); } ``` Use the JSON parser to parse JSON text, the output is: ``` cargo run --example json Compiling pom v0.6.0 (file:///work/pom) Finished debug [unoptimized + debuginfo] target(s) in 2.20 secs Running `target/debug/examples/json` Ok(Object({"Image": Object({"Width": Num(800), "Title": Str("View from 15th Floor"), "Height": Num(600), "Animated": Bool(false), "IDs": Array([Num(116), Num(943), Num(234), Num(38793)]), "Thumbnail": Object({"Height ": Num(125), "Url": Str("http://www.example.com/image/481989943"), "Width": Num(100)})}), "escaped characters": Str("→😀\"\t𝄞")})) ``` The above parser assumes that the input bytes is UTF-8 encoded text; otherwise, you can use the [char version of JSON parser](https://github.com/J-F-Liu/pom/blob/master/examples/json_char.rs). `p >> q` is not covered in the JSON example. It is used to pass the output of `p` into parser creation of `p`. ```rust let mut input = DataInput::new(b"5oooooooo"); let parser = one_of(b"0123456789").map(|c|c - b'0') >> |n| { take(n as usize) + sym(b'o').repeat(0..) }; let output = parser.parse(&mut input); assert_eq!(output, Ok( (vec![b'o';5], vec![b'o';3]) )); ``` The first character indicates the number of `o`s to parse, then the number is used in the closure `|n| take(n)`. ## More examples - A [simple PDF parser](https://github.com/J-F-Liu/lopdf/blob/491dece5867a2b81878208bcb5e07ff1007c0d89/src/parser.rs), you can compare it with the equivalent [nom version](https://github.com/J-F-Liu/lopdf/blob/dff82c49fea9ac9ea23edf42ad80e480bd5edb46/src/parser.rs). 
- A [complete PDF parser](https://github.com/J-F-Liu/lopdf/blob/master/src/parser.rs) which can read length object when parsing stream object. ## Conclusion I think I created something really cool, you can use pom to write all kinds of parsers elegantly. I helped pom to evolve version by version into what it is, and pom also helps me to grow my Rust programming skills a lot. Of course there is still room for improvement, any feed back is welcome. ## Points of interest I try to add a `cache()` method to `Parser`. Memorize the result on given input position, return the result directly when called again, effectively implementing the Packrat Parsing algorithm. But there are two problems, 1) save result means mutate a Hashmap, so Parser's method field should be a Box of `FnMut`, 2) Hashmap returns an reference of value for a given key, the value cannot be moved, so need to make the value cloneable. ## Pain points where Rust needs to improve 1. Implement trait for `[T]` should automatically implement `[T; N]`. 2. The standard library should provide a char_at() method return the char and the number of bytes consumed, like: ```rust pub trait Encoding { /// Get char at a byte index, return the char and the number of bytes read. fn char_at(&self, data: &[u8], index: usize) -> Result<(char, u32)>; } ``` 3. Can ellide 'static lifetime parameter, allow `Parser<'static, I, O>` written as `Parser`. 4. Should `impl Copy for closure`, so that FnOnce closure can be passed to map() inside Fn closure. 
```rust pub fn map(self, f: F) -> Parser<'a, I, U> where F: FnOnce(O) -> U + Copy + 'a, I: 'static, O: 'static, U: 'static { Parser::new(move |input: &mut Input| { self.parse(input).map(f) }) } ``` ## More Readings - [The Rust programming language, in the words of its practitioners](https://brson.github.io/fireflowers/) - [PEGs, Packrats and Parser Combinators](http://scg.unibe.ch/download/lectures/cc2011/10PEGs.pptx.pdf) - [An introduction to parsing text in Haskell with Parsec](http://unbui.lt/#!/post/haskell-parsec-basics/) ================================================ FILE: examples/duration.rs ================================================ use pom::parser::*; use pom::Parser; use std::str::{self, FromStr}; #[derive(Debug, PartialEq)] struct Duration { years: Option, months: Option, weeks: Option, days: Option, hours: Option, minutes: Option, seconds: Option, } fn number_separator() -> Parser { // either '.' or ',' can be used as a separator between the whole and decimal part of a number one_of(b".,").discard() } fn number() -> Parser { let integer = one_of(b"0123456789").repeat(0..); let frac = number_separator() + one_of(b"0123456789").repeat(1..); let number = integer + frac.opt(); number .collect() .convert(str::from_utf8) .convert(f32::from_str) } fn date_part() -> Parser, Option, Option, Option)> { ((number() - sym(b'Y')).opt() + (number() - sym(b'M')).opt() + (number() - sym(b'W')).opt() + (number() - sym(b'D')).opt()) .map(|(((years, months), weeks), days)| (years, months, weeks, days)) } fn time_part() -> Parser, Option, Option)> { sym(b'T') * ((number() - sym(b'H')).opt() + (number() - sym(b'M')).opt() + (number() - sym(b'S')).opt()) .map(|((hours, minutes), seconds)| (hours, minutes, seconds)) } fn parser() -> Parser { sym(b'P') * (time_part().map(|(hours, minutes, seconds)| Duration { years: None, months: None, weeks: None, days: None, hours, minutes, seconds, }) | (date_part() + time_part()).map(|(date_elements, time_elements)| { let (years, 
months, weeks, days) = date_elements; let (hours, minutes, seconds) = time_elements; Duration { years, months, weeks, days, hours, minutes, seconds, } })) } /// Parses the ISO 8601 Duration standard /// https://en.wikipedia.org/wiki/ISO_8601#Durations fn main() { let input = "P3Y6M4DT12H30M5S"; let result = parser().parse(input.as_bytes()); assert_eq!( Duration { years: Some(3f32), months: Some(6f32), weeks: None, days: Some(4f32), hours: Some(12f32), minutes: Some(30f32), seconds: Some(5f32) }, result.unwrap() ); } ================================================ FILE: examples/json.rs ================================================ use pom::char_class::hex_digit; use pom::parser::*; use std::char::{decode_utf16, REPLACEMENT_CHARACTER}; use std::collections::HashMap; use std::str::{self, FromStr}; #[derive(Debug, PartialEq)] pub enum JsonValue { Null, Bool(bool), Str(String), Num(f64), Array(Vec), Object(HashMap), } fn space<'a>() -> Parser<'a, u8, ()> { one_of(b" \t\r\n").repeat(0..).discard() } fn number<'a>() -> Parser<'a, u8, f64> { let integer = one_of(b"123456789") - one_of(b"0123456789").repeat(0..) | sym(b'0'); let frac = sym(b'.') + one_of(b"0123456789").repeat(1..); let exp = one_of(b"eE") + one_of(b"+-").opt() + one_of(b"0123456789").repeat(1..); let number = sym(b'-').opt() + integer + frac.opt() + exp.opt(); number .collect() .convert(str::from_utf8) .convert(f64::from_str) } fn string<'a>() -> Parser<'a, u8, String> { let special_char = sym(b'\\') | sym(b'/') | sym(b'"') | sym(b'b').map(|_| b'\x08') | sym(b'f').map(|_| b'\x0C') | sym(b'n').map(|_| b'\n') | sym(b'r').map(|_| b'\r') | sym(b't').map(|_| b'\t'); let escape_sequence = sym(b'\\') * special_char; let char_string = (none_of(b"\\\"") | escape_sequence) .repeat(1..) 
.convert(String::from_utf8); let utf16_char = seq(b"\\u") * is_a(hex_digit) .repeat(4) .convert(String::from_utf8) .convert(|digits| u16::from_str_radix(&digits, 16)); let utf16_string = utf16_char.repeat(1..).map(|chars| { decode_utf16(chars) .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER)) .collect::() }); let string = sym(b'"') * (char_string | utf16_string).repeat(0..) - sym(b'"'); string.map(|strings| strings.concat()) } fn array<'a>() -> Parser<'a, u8, Vec> { let elems = list(call(value), sym(b',') * space()); sym(b'[') * space() * elems - sym(b']') } fn object<'a>() -> Parser<'a, u8, HashMap> { let member = string() - space() - sym(b':') - space() + call(value); let members = list(member, sym(b',') * space()); let obj = sym(b'{') * space() * members - sym(b'}'); obj.map(|members| members.into_iter().collect::>()) } fn value<'a>() -> Parser<'a, u8, JsonValue> { (seq(b"null").map(|_| JsonValue::Null) | seq(b"true").map(|_| JsonValue::Bool(true)) | seq(b"false").map(|_| JsonValue::Bool(false)) | number().map(|num| JsonValue::Num(num)) | string().map(|text| JsonValue::Str(text)) | array().map(|arr| JsonValue::Array(arr)) | object().map(|obj| JsonValue::Object(obj))) - space() } pub fn json<'a>() -> Parser<'a, u8, JsonValue> { space() * value() - end() } #[allow(dead_code)] fn main() { let input = br#" { "Image": { "Width": 800, "Height": 600, "Title": "View from 15th Floor", "Thumbnail": { "Url": "http://www.example.com/image/481989943", "Height": 125, "Width": 100 }, "Animated" : false, "IDs": [116, 943, 234, 38793] }, "escaped characters": "\u2192\uD83D\uDE00\"\t\uD834\uDD1E" }"#; println!("{:?}", json().parse(input)); } ================================================ FILE: examples/json_char.rs ================================================ use pom::parser::*; use std::char::{decode_utf16, REPLACEMENT_CHARACTER}; use std::collections::HashMap; use std::iter::FromIterator; use std::str::FromStr; #[derive(Debug, PartialEq)] pub enum JsonValue { Null, Bool(bool), 
Str(String), Num(f64), Array(Vec), Object(HashMap), } fn space<'a>() -> Parser<'a, char, ()> { one_of(" \t\r\n").repeat(0..).discard() } fn number<'a>() -> Parser<'a, char, f64> { let integer = one_of("123456789") - one_of("0123456789").repeat(0..) | sym('0'); let frac = sym('.') + one_of("0123456789").repeat(1..); let exp = one_of("eE") + one_of("+-").opt() + one_of("0123456789").repeat(1..); let number = sym('-').opt() + integer + frac.opt() + exp.opt(); number .collect() .map(String::from_iter) .convert(|s| f64::from_str(&s)) } fn string<'a>() -> Parser<'a, char, String> { let special_char = sym('\\') | sym('/') | sym('"') | sym('b').map(|_| '\x08') | sym('f').map(|_| '\x0C') | sym('n').map(|_| '\n') | sym('r').map(|_| '\r') | sym('t').map(|_| '\t'); let escape_sequence = sym('\\') * special_char; let char_string = (none_of("\\\"") | escape_sequence) .repeat(1..) .map(String::from_iter); let utf16_char = tag("\\u") * is_a(|c: char| c.is_digit(16)) .repeat(4) .map(String::from_iter) .convert(|digits| u16::from_str_radix(&digits, 16)); let utf16_string = utf16_char.repeat(1..).map(|chars| { decode_utf16(chars) .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER)) .collect::() }); let string = sym('"') * (char_string | utf16_string).repeat(0..) 
- sym('"'); string.map(|strings| strings.concat()) } fn array<'a>() -> Parser<'a, char, Vec> { let elems = list(call(value), sym(',') * space()); sym('[') * space() * elems - sym(']') } fn object<'a>() -> Parser<'a, char, HashMap> { let member = string() - space() - sym(':') - space() + call(value); let members = list(member, sym(',') * space()); let obj = sym('{') * space() * members - sym('}'); obj.map(|members| members.into_iter().collect::>()) } fn value<'a>() -> Parser<'a, char, JsonValue> { (tag("null").map(|_| JsonValue::Null) | tag("true").map(|_| JsonValue::Bool(true)) | tag("false").map(|_| JsonValue::Bool(false)) | number().map(|num| JsonValue::Num(num)) | string().map(|text| JsonValue::Str(text)) | array().map(|arr| JsonValue::Array(arr)) | object().map(|obj| JsonValue::Object(obj))) - space() } pub fn json<'a>() -> Parser<'a, char, JsonValue> { space() * value() - end() } #[allow(dead_code)] fn main() { let test = r#" { "Image": { "Width": 800, "Height": 600, "Title": "View from 15th Floor", "Thumbnail": { "Url": "http://www.example.com/image/481989943", "Height": 125, "Width": 100 }, "Animated" : false, "IDs": [116, 943, 234, 38793] }, "escaped characters": "\u2192\uD83D\uDE00\"\t\uD834\uDD1E" }"#; let input: Vec = test.chars().collect(); println!("{:?}", json().parse(&input)); } ================================================ FILE: examples/json_file.rs ================================================ use pom::char_class::hex_digit; use pom::parser::{call, end, is_a, list, none_of, one_of, seq, sym, Parser}; use std::char::{decode_utf16, REPLACEMENT_CHARACTER}; use std::collections::HashMap; use std::fs::File; use std::io::Read; use std::str::{self, FromStr}; #[derive(Debug, PartialEq)] pub enum JsonValue { Null, Bool(bool), Str(String), Num(f64), Array(Vec), Object(HashMap), } fn space<'a>() -> Parser<'a, u8, ()> { one_of(b" \t\r\n").repeat(0..).discard() } fn number<'a>() -> Parser<'a, u8, f64> { let integer = one_of(b"123456789") - 
one_of(b"0123456789").repeat(0..) | sym(b'0'); let frac = sym(b'.') + one_of(b"0123456789").repeat(1..); let exp = one_of(b"eE") + one_of(b"+-").opt() + one_of(b"0123456789").repeat(1..); let number = sym(b'-').opt() + integer + frac.opt() + exp.opt(); number .collect() .convert(str::from_utf8) .convert(f64::from_str) } fn string<'a>() -> Parser<'a, u8, String> { let special_char = sym(b'\\') | sym(b'/') | sym(b'"') | sym(b'b').map(|_| b'\x08') | sym(b'f').map(|_| b'\x0C') | sym(b'n').map(|_| b'\n') | sym(b'r').map(|_| b'\r') | sym(b't').map(|_| b'\t'); let escape_sequence = sym(b'\\') * special_char; let char_string = (none_of(b"\\\"") | escape_sequence) .repeat(1..) .convert(String::from_utf8); let utf16_char = seq(b"\\u") * is_a(hex_digit) .repeat(4) .convert(String::from_utf8) .convert(|digits| u16::from_str_radix(&digits, 16)); let utf16_string = utf16_char.repeat(1..).map(|chars| { decode_utf16(chars) .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER)) .collect::() }); let string = sym(b'"') * (char_string | utf16_string).repeat(0..) 
- sym(b'"'); string.map(|strings| strings.concat()) } fn array<'a>() -> Parser<'a, u8, Vec> { let elems = list(call(value), sym(b',') * space()); sym(b'[') * space() * elems - sym(b']') } fn object<'a>() -> Parser<'a, u8, HashMap> { let member = string() - space() - sym(b':') - space() + call(value); let members = list(member, sym(b',') * space()); let obj = sym(b'{') * space() * members - sym(b'}'); obj.map(|members| members.into_iter().collect::>()) } fn value<'a>() -> Parser<'a, u8, JsonValue> { (seq(b"null").map(|_| JsonValue::Null) | seq(b"true").map(|_| JsonValue::Bool(true)) | seq(b"false").map(|_| JsonValue::Bool(false)) | number().map(|num| JsonValue::Num(num)) | string().map(|text| JsonValue::Str(text)) | array().map(|arr| JsonValue::Array(arr)) | object().map(|obj| JsonValue::Object(obj))) - space() } pub fn json<'a>() -> Parser<'a, u8, JsonValue> { space() * value() - end() } #[allow(dead_code)] fn main() { let mut file = File::open("examples/test.json").unwrap(); let mut input: Vec = Vec::new(); file.read_to_end(&mut input).expect("read test.json"); println!("{:?}", json().parse(input.as_slice())); } ================================================ FILE: examples/simple.rs ================================================ use pom::parser::*; fn main() { let input = b"abcde"; let parser = sym(b'a') * none_of(b"AB") - sym(b'c') + seq(b"de"); let output = parser.parse(input); // assert_eq!(output, Ok( (b'b', &b"de"[..]) ) ); println!("{:?}", output); } ================================================ FILE: examples/test.json ================================================ { "Image": { "Width": 800, "Height": 600, "Title": "View from 15th Floor", "Thumbnail": { "Url": "http://www.example.com/image/481989943", "Height": 125, "Width": 100 }, "Animated" : false, "IDs": [116, 943, 234, 38793] }, "escaped characters": "\u2192\uD83D\uDE00\"\t\uD834\uDD1E" } ================================================ FILE: examples/utf8.rs 
================================================ // Example shows basic UTF-8 combinators use pom::utf8::*; fn main() { // Informal, Spanish-language movie database format let input = "\ Título: Abre los ojos Año: 1997 Director: Alejandro Amenábar Título: Amores Perros Director: Alejandro González Iñárritu Año: 2000 Título: La montaña sagrada Año: 1973 Director: Alejandro Jodorowsky "; enum DataLine<'a> { Title(&'a str), Director(&'a str), Year(i32), } fn positive<'a>() -> Parser<'a, i32> { // let integer = (one_of("123456789") - one_of("0123456789").repeat(0..)) | sym(b'0'); // TODO let digit = one_of("0123456789"); let integer = digit.discard().repeat(1..); integer.collect().convert(|x| x.parse::()) } fn rest_str<'a>() -> Parser<'a, &'a str> { any().repeat(1..).collect() } fn separator<'a>() -> Parser<'a, ()> { seq(": ").discard() } let parser = (seq("Título") * separator() * rest_str().map(|s| DataLine::Title(s))) | (seq("Director") * separator() * rest_str().map(|s| DataLine::Director(s))) | (seq("Año") * separator() * positive().map(|i| DataLine::Year(i))); { let mut title_opt: Option<&str> = None; let mut year_opt: Option = None; let mut director_opt: Option<&str> = None; for line in input.lines() { if !line.is_empty() { // Skip blank lines without parsing // Parse line match parser.parse_str(line).unwrap() { DataLine::Title(s) => title_opt = Some(s), DataLine::Director(s) => director_opt = Some(s), DataLine::Year(s) => year_opt = Some(s), } // When all three line types have been collected, print them if let (Some(title), Some(year), Some(director)) = (title_opt, year_opt, director_opt) { println!("Title: {}\nDirector: {}\nYear: {}\n", title, director, year); (title_opt, year_opt, director_opt) = (None, None, None); } } } } } ================================================ FILE: examples/utf8_mixed.rs ================================================ // Example shows UTF-8 combinators intermixed with binary combinators use pom::parser::*; use pom::utf8; fn 
main() { // A parser for MsgPack (but only messages encoding a string) let testcases: [Vec; 6] = [ vec![0b10100100, 0b11110000, 0b10011111, 0b10100100, 0b10010100], // 🤔, max-size 31 format vec![0xd9, 4, 0b11110000, 0b10011111, 0b10011000, 0b10101110], // 😮, max-size 255 format vec![0xda, 0, 4, 0b11110000, 0b10011111, 0b10100100, 0b10101111], // 🤯, max-size 2^16-1 format vec![ 0xdb, 0, 0, 0, 4, 0b11110000, 0b10011111, 0b10010010, 0b10100101, ], // 💥, max-size 2^32-1 format vec![0xc4, 4, 0b11110000, 0b10011111, 0b10011000, 0b10101110], // Valid MsgPack, but not a string (binary) vec![0b10100100, 0b10010100, 0b10100100, 0b10011111, 0b11110000], // A MsgPack string, but invalid UTF-8 ]; const MASK: u8 = 0b11100000; // size 31 format is denoted by 3 high bits == 101 const SIZE_31: u8 = 0b10100000; fn rest_as_str<'a>() -> utf8::Parser<'a, &'a str> { utf8::any().repeat(0..).collect() } // Demo parser does not verify that the claimed length matches the actual length (but checking so is simple with >>) let parser = (sym(0xdb) * any().repeat(4) * rest_as_str()) // 2^32-1 format | (sym(0xda) * any().repeat(2) * rest_as_str()) // 2^16-1 format | (sym(0xd9) * any() * rest_as_str()) // 255 format | (is_a(|x| x&MASK == SIZE_31) * rest_as_str()) // 31 format - end(); for testcase in testcases.iter() { println!("{:?}", parser.parse(testcase)); } } ================================================ FILE: examples/whitespace.rs ================================================ use pom::parser::*; #[derive(Clone, Debug, PartialEq)] struct Container { containers: Vec, contents: Vec, } enum TmpContainerOrContent { Container(Container), Content(String), } fn whitespace<'a>() -> Parser<'a, u8, ()> { one_of(b" \t\r\n").repeat(0..).discard() } fn linebreak<'a>() -> Parser<'a, u8, ()> { sym(b'\r').opt() * sym(b'\n').discard() } fn indented<'a>() -> Parser<'a, u8, Vec> { sym(b'\t') * none_of(b"\n\r").repeat(1..) 
- linebreak() } fn empty<'a>() -> Parser<'a, u8, ()> { one_of(b" \t").repeat(0..).discard() - linebreak() } fn content<'a>() -> Parser<'a, u8, String> { none_of(b" \t\r\n").repeat(1..).convert(String::from_utf8) - linebreak() } fn subcontainer<'a>() -> Parser<'a, u8, (Vec, Vec)> { (call(container).map(|ctr| TmpContainerOrContent::Container(ctr)) | content().map(|ctn| TmpContainerOrContent::Content(ctn))) .repeat(1..) .map(|tmp| { tmp.into_iter().fold((vec![], vec![]), |acc, x| match x { TmpContainerOrContent::Container(ct) => ( acc.0.into_iter().chain(vec![ct].into_iter()).collect(), acc.1, ), TmpContainerOrContent::Content(cn) => ( acc.0, acc.1.into_iter().chain(vec![cn].into_iter()).collect(), ), }) }) } fn container<'a>() -> Parser<'a, u8, Container> { seq(b"Container\n") * (indented() | empty().map(|()| vec![])) .repeat(1..) .map(|lines| { lines .into_iter() .filter(|line| line.len() > 0) .fold(vec![], |accum, line| { accum .into_iter() .chain(line.into_iter().chain(vec![b'\n'].into_iter())) .collect() }) }) .map(|deden| subcontainer().parse(&deden).expect("subcont")) .map(|(containers, contents)| Container { containers, contents, }) } fn mylang<'a>() -> Parser<'a, u8, Vec> { whitespace() * list(call(container), whitespace()) } fn main() -> Result<(), ()> { let input = br#" Container Container a b c 1 2 3 Container q Container foo bar Container baz quux "#; assert_eq!( mylang().parse(input), Ok(vec![ Container { containers: vec![ Container { containers: vec![], contents: vec!["a".into(), "b".into(), "c".into(),] }, Container { containers: vec![], contents: vec!["q".into(),] } ], contents: vec!["1".into(), "2".into(), "3".into(),] }, Container { containers: vec![Container { contents: vec!["baz".into(), "quux".into(),], containers: vec![], },], contents: vec!["foo".into(), "bar".into(),] }, ]) ); Ok(()) } ================================================ FILE: rustfmt.toml ================================================ format_strings = false reorder_imports = 
true hard_tabs = true ================================================ FILE: src/char_class.rs ================================================ /// Recognises an alphabetic character, `a-zA-Z`. #[inline] pub fn alpha(term: u8) -> bool { term.is_ascii_alphabetic() } /// Recognises an alphabetic character, `A-Z`. #[inline] pub fn alpha_uppercase(term: u8) -> bool { term.is_ascii_uppercase() } /// Recognises an alphabetic character, `a-z`. #[inline] pub fn alpha_lowercase(term: u8) -> bool { term.is_ascii_lowercase() } /// Recognises a decimal digit, `0-9`. #[inline] pub fn digit(term: u8) -> bool { term.is_ascii_digit() } /// Recognises an alphanumeric character, `a-zA-Z0-9`. #[inline] pub fn alphanum(term: u8) -> bool { term.is_ascii_alphanumeric() } /// Recognises a hexadecimal digit, `0-9a-fA-F`. #[inline] pub fn hex_digit(term: u8) -> bool { matches!(term, 0x30..=0x39 | 0x41..=0x46 | 0x61..=0x66) } /// Recognises an octal digit, `0-7`. #[inline] pub fn oct_digit(term: u8) -> bool { matches!(term, 0x30..=0x37) } /// Recognises a space or tab. #[inline] pub fn space(term: u8) -> bool { matches!(term, b' ' | b'\t') } /// Recognises a space, tab, line feed, or carriage return. 
#[inline] pub fn multispace(term: u8) -> bool { space(term) || matches!(term, b'\n' | b'\r') } #[cfg(test)] mod test { use super::*; #[test] fn is_an_alpha() { assert!(alpha(b'A')); assert!(alpha(b'Z')); assert!(alpha(b'a')); assert!(alpha(b'z')); } #[test] fn is_an_alpha_uppercase() { assert!(alpha_uppercase(b'A')); assert!(alpha_uppercase(b'Z')); assert!(!alpha_uppercase(b'a')); assert!(!alpha_uppercase(b'z')); } #[test] fn is_an_alpha_lowercase() { assert!(!alpha_lowercase(b'A')); assert!(!alpha_lowercase(b'Z')); assert!(alpha_lowercase(b'a')); assert!(alpha_lowercase(b'z')); } #[test] fn is_a_digit() { assert!(digit(b'0')); assert!(digit(b'9')); assert!(!digit(b'A')); } #[test] fn is_an_alphanum() { assert!(alphanum(b'A')); assert!(alphanum(b'Z')); assert!(alphanum(b'a')); assert!(alphanum(b'z')); assert!(alphanum(b'0')); assert!(alphanum(b'9')); assert!(!alphanum(b'#')); } #[test] fn is_a_hex_digit() { assert!(hex_digit(b'0')); assert!(hex_digit(b'9')); assert!(hex_digit(b'A')); assert!(hex_digit(b'F')); assert!(hex_digit(b'a')); assert!(hex_digit(b'f')); assert!(!hex_digit(b'G')); } #[test] fn is_a_oct_digit() { assert!(oct_digit(b'0')); assert!(oct_digit(b'7')); assert!(!oct_digit(b'8')); assert!(!oct_digit(b'9')); } #[test] fn is_space() { assert!(space(b' ')); assert!(space(b'\t')); assert!(!space(b'\n')); assert!(!space(b'A')); } #[test] fn is_multispace() { assert!(multispace(b' ')); assert!(multispace(b'\t')); assert!(multispace(b'\n')); assert!(!multispace(b'A')); } } ================================================ FILE: src/lib.rs ================================================ pub(crate) mod range; mod result; pub(crate) mod set; /// Contains predefined parsers and combinators. pub mod parser; /// Utility functions to recognize char class of byte value. pub mod char_class; /// Variants of parser functions specialized for matching UTF-8 strings and returning chars. 
/// Method and constructor names/functionality are generally the same as in base parser module. #[cfg(feature = "utf8")] pub mod utf8; pub use crate::result::{Error, Result}; /// Parser type, `Parser` is alias of `parser::Parser<'static, I, O>`. pub type Parser = parser::Parser<'static, I, O>; ================================================ FILE: src/parser.rs ================================================ use super::{Error, Result}; use crate::{range::RangeArgument, set::Set}; use std::{ fmt::{Debug, Display}, ops::Bound::{Excluded, Included, Unbounded}, ops::{Add, BitOr, Mul, Neg, Not, Shr, Sub}, }; type Parse<'a, I, O> = dyn Fn(&'a [I], usize) -> Result<(O, usize)> + 'a; /// Parser combinator. pub struct Parser<'a, I, O> { pub method: Box>, } impl<'a, I, O> Parser<'a, I, O> { /// Create new parser. pub fn new

(parse: P) -> Self where P: Fn(&'a [I], usize) -> Result<(O, usize)> + 'a, { Self { method: Box::new(parse), } } /// Apply the parser to parse input. pub fn parse(&self, input: &'a [I]) -> Result { (self.method)(input, 0).map(|(out, _)| out) } /// Parse input at specified position. pub fn parse_at(&self, input: &'a [I], start: usize) -> Result<(O, usize)> { (self.method)(input, start) } /// Convert parser result to desired value. pub fn map(self, f: F) -> Parser<'a, I, U> where F: Fn(O) -> U + 'a, I: 'a, O: 'a, U: 'a, { Parser::new(move |input: &'a [I], start: usize| { (self.method)(input, start).map(|(out, pos)| (f(out), pos)) }) } /// Convert parser result to desired value, fail in case of conversion error. pub fn convert(self, f: F) -> Parser<'a, I, U> where F: Fn(O) -> ::std::result::Result + 'a, E: Debug, O: 'a, U: 'a, { Parser::new(move |input: &'a [I], start: usize| { (self.method)(input, start).and_then(|(res, pos)| match f(res) { Ok(out) => Ok((out, pos)), Err(err) => Err(Error::Conversion { message: format!("Conversion error: {:?}", err), position: start, }), }) }) } /// Cache parser output result to speed up backtracking. pub fn cache(self) -> Self where O: Clone + 'a, { use std::{cell::RefCell, collections::HashMap}; let results = RefCell::new(HashMap::new()); Self::new(move |input: &'a [I], start: usize| { let key = (start, format!("{:p}", &self.method)); results .borrow_mut() .entry(key) .or_insert_with(|| (self.method)(input, start)) .clone() }) } /// Get input position after matching parser. pub fn pos(self) -> Parser<'a, I, usize> where O: 'a, { Parser::new(move |input: &'a [I], start: usize| { (self.method)(input, start).map(|(_, pos)| (pos, pos)) }) } /// Collect all matched input symbols. pub fn collect(self) -> Parser<'a, I, &'a [I]> where O: 'a, { Parser::new(move |input: &'a [I], start: usize| { (self.method)(input, start).map(|(_, end)| (&input[start..end], end)) }) } /// Discard parser output. 
pub fn discard(self) -> Parser<'a, I, ()> where O: 'a, { Parser::new(move |input: &'a [I], start: usize| { (self.method)(input, start).map(|(_, end)| ((), end)) }) } /// Make parser optional. pub fn opt(self) -> Parser<'a, I, Option> where O: 'a, { Parser::new( move |input: &'a [I], start: usize| match (self.method)(input, start) { Ok((out, pos)) => Ok((Some(out), pos)), Err(_) => Ok((None, start)), }, ) } /// `p.repeat(5)` repeat p exactly 5 times /// `p.repeat(0..)` repeat p zero or more times /// `p.repeat(1..)` repeat p one or more times /// `p.repeat(1..4)` match p at least 1 and at most 3 times pub fn repeat(self, range: R) -> Parser<'a, I, Vec> where R: RangeArgument + Debug + 'a, O: 'a, { Parser::new(move |input: &'a [I], start: usize| { let mut items = vec![]; let mut pos = start; loop { match range.end() { Included(&max_count) => { if items.len() >= max_count { break; } } Excluded(&max_count) => { if items.len() + 1 >= max_count { break; } } Unbounded => (), } let Ok((item, item_pos)) = (self.method)(input, pos) else { break; }; items.push(item); pos = item_pos; } if let Included(&min_count) = range.start() { if items.len() < min_count { return Err(Error::Mismatch { message: format!( "expect repeat at least {} times, found {} times", min_count, items.len() ), position: start, }); } } Ok((items, pos)) }) } #[cfg(not(feature = "trace"))] /// Give parser a name to identify parsing errors. pub fn name(self, name: &'a str) -> Self where O: 'a, { Parser::new( move |input: &'a [I], start: usize| match (self.method)(input, start) { res @ Ok(_) => res, Err(err) => match err { Error::Custom { .. } => Err(err), _ => Err(Error::Custom { message: format!("failed to parse {}", name), position: start, inner: Some(Box::new(err)), }), }, }, ) } #[cfg(feature = "trace")] /// Trace parser calls and results. 
Similar to name pub fn name(self, name: &'a str) -> Self where O: 'a, { Parser::new(move |input: &'a [I], start: usize| { eprintln!("parse: {} ({})", name, start); match (self.method)(input, start) { res @ Ok(_) => { eprintln!(" {} ({}): ok", name, start); res } Err(err) => { eprintln!(" {} ({}): error", name, start); match err { Error::Custom { .. } => Err(err), _ => Err(Error::Custom { message: format!("failed to parse {}", name), position: start, inner: Some(Box::new(err)), }), } } } }) } /// Mark parser as expected, abort early when failed in ordered choice. pub fn expect(self, name: &'a str) -> Self where O: 'a, { Parser::new( move |input: &'a [I], start: usize| match (self.method)(input, start) { res @ Ok(_) => res, Err(err) => Err(Error::Expect { message: format!("Expect {}", name), position: start, inner: Box::new(err), }), }, ) } } /// Always succeeds, consume no input. pub fn empty<'a, I>() -> Parser<'a, I, ()> { Parser::new(|_: &[I], start: usize| Ok(((), start))) } /// Match any symbol. pub fn any<'a, I>() -> Parser<'a, I, I> where I: Clone, { Parser::new(|input: &[I], start: usize| { let Some(s) = input.get(start) else { return Err(Error::Mismatch { message: "end of input reached".to_owned(), position: start, }); }; Ok((s.clone(), start + 1)) }) } /// Success when current input symbol equals `t`. pub fn sym<'a, I>(t: I) -> Parser<'a, I, I> where I: Clone + PartialEq + Display, { Parser::new(move |input: &'a [I], start: usize| { let Some(s) = input.get(start) else { return Err(Error::Incomplete); }; if t != *s { return Err(Error::Mismatch { message: format!("expect: {}, found: {}", t, s), position: start, }); } Ok((s.clone(), start + 1)) }) } /// Success when sequence of symbols matches current input. 
pub fn seq<'a, 'b: 'a, I>(tag: &'b [I]) -> Parser<'a, I, &'a [I]> where I: PartialEq + Debug, { Parser::new(move |input: &'a [I], start: usize| { let mut index = 0; loop { let pos = start + index; if index == tag.len() { return Ok((tag, pos)); } let Some(s) = input.get(pos) else { return Err(Error::Incomplete); }; if tag[index] != *s { return Err(Error::Mismatch { message: format!("seq {:?} expect: {:?}, found: {:?}", tag, tag[index], s), position: pos, }); } index += 1; } }) } /// Success when tag matches current input. pub fn tag<'a, 'b: 'a>(tag: &'b str) -> Parser<'a, char, &'a str> { Parser::new(move |input: &'a [char], start: usize| { let mut pos = start; for c in tag.chars() { let Some(&s) = input.get(pos) else { return Err(Error::Incomplete); }; if c != s { return Err(Error::Mismatch { message: format!("tag {:?} expect: {:?}, found: {}", tag, c, s), position: pos, }); } pos += 1; } Ok((tag, pos)) }) } /// Parse separated list. pub fn list<'a, I, O, U>( parser: Parser<'a, I, O>, separator: Parser<'a, I, U>, ) -> Parser<'a, I, Vec> where O: 'a, U: 'a, { Parser::new(move |input: &'a [I], start: usize| { let mut items = vec![]; let mut pos = start; if let Ok((first_item, first_pos)) = (parser.method)(input, pos) { items.push(first_item); pos = first_pos; while let Ok((_, sep_pos)) = (separator.method)(input, pos) { match (parser.method)(input, sep_pos) { Ok((more_item, more_pos)) => { items.push(more_item); pos = more_pos; } Err(_) => break, } } } Ok((items, pos)) }) } /// Success when current input symbol is one of the set. 
pub fn one_of<'a, I, S>(set: &'a S) -> Parser<'a, I, I> where I: Clone + PartialEq + Display + Debug, S: Set + ?Sized, { Parser::new(move |input: &'a [I], start: usize| { let Some(s) = input.get(start) else { return Err(Error::Incomplete); }; if !set.contains(s) { return Err(Error::Mismatch { message: format!("expect one of: {}, found: {}", set.to_str(), s), position: start, }); }; Ok((s.clone(), start + 1)) }) } /// Success when current input symbol is none of the set. pub fn none_of<'a, I, S>(set: &'static S) -> Parser<'a, I, I> where I: Clone + PartialEq + Display + Debug, S: Set + ?Sized, { Parser::new(move |input: &'a [I], start: usize| { let Some(s) = input.get(start) else { return Err(Error::Incomplete); }; if set.contains(s) { return Err(Error::Mismatch { message: format!("expect none of: {}, found: {}", set.to_str(), s), position: start, }); } Ok((s.clone(), start + 1)) }) } /// Success when predicate returns true on current input symbol. pub fn is_a<'a, I, F>(predicate: F) -> Parser<'a, I, I> where I: Clone + PartialEq + Display + Debug, F: Fn(I) -> bool + 'a, { Parser::new(move |input: &'a [I], start: usize| { let Some(s) = input.get(start) else { return Err(Error::Incomplete); }; if !predicate(s.clone()) { return Err(Error::Mismatch { message: format!("is_a predicate failed on: {}", s), position: start, }); } Ok((s.clone(), start + 1)) }) } /// Success when predicate returns false on current input symbol. pub fn not_a<'a, I, F>(predicate: F) -> Parser<'a, I, I> where I: Clone + PartialEq + Display + Debug, F: Fn(I) -> bool + 'a, { Parser::new(move |input: &'a [I], start: usize| { let Some(s) = input.get(start) else { return Err(Error::Incomplete); }; if predicate(s.clone()) { return Err(Error::Mismatch { message: format!("not_a predicate failed on: {}", s), position: start, }); } Ok((s.clone(), start + 1)) }) } /// Read n symbols. 
pub fn take<'a, I>(n: usize) -> Parser<'a, I, &'a [I]> { Parser::new(move |input: &'a [I], start: usize| { let pos = start + n; if input.len() < pos { return Err(Error::Incomplete); } Ok((&input[start..pos], pos)) }) } /// Skip n symbols. pub fn skip<'a, I>(n: usize) -> Parser<'a, I, ()> { Parser::new(move |input: &'a [I], start: usize| { let pos = start + n; if input.len() < pos { return Err(Error::Incomplete); } Ok(((), pos)) }) } /// Call a parser factory, can be used to create recursive parsers. pub fn call<'a, I, O, F>(parser_factory: F) -> Parser<'a, I, O> where O: 'a, F: Fn() -> Parser<'a, I, O> + 'a, { Parser::new(move |input: &'a [I], start: usize| { let parser = parser_factory(); (parser.method)(input, start) }) } /// Success when end of input is reached. pub fn end<'a, I>() -> Parser<'a, I, ()> where I: Display, { Parser::new(|input: &'a [I], start: usize| { if let Some(s) = input.get(start) { return Err(Error::Mismatch { message: format!("expect end of input, found: {}", s), position: start, }); } Ok(((), start)) }) } /// Sequence reserve value impl<'a, I, O: 'a, U: 'a> Add> for Parser<'a, I, O> { type Output = Parser<'a, I, (O, U)>; fn add(self, other: Parser<'a, I, U>) -> Self::Output { Parser::new(move |input: &'a [I], start: usize| { (self.method)(input, start).and_then(|(out1, pos1)| { (other.method)(input, pos1).map(|(out2, pos2)| ((out1, out2), pos2)) }) }) } } /// Sequence discard second value impl<'a, I, O: 'a, U: 'a> Sub> for Parser<'a, I, O> { type Output = Parser<'a, I, O>; fn sub(self, other: Parser<'a, I, U>) -> Self::Output { Parser::new(move |input: &'a [I], start: usize| { (self.method)(input, start) .and_then(|(out1, pos1)| (other.method)(input, pos1).map(|(_, pos2)| (out1, pos2))) }) } } /// Sequence discard first value impl<'a, I: 'a, O: 'a, U: 'a> Mul> for Parser<'a, I, O> { type Output = Parser<'a, I, U>; fn mul(self, other: Parser<'a, I, U>) -> Self::Output { Parser::new(move |input: &'a [I], start: usize| { (self.method)(input, 
start).and_then(|(_, pos1)| (other.method)(input, pos1)) }) } } /// Chain two parsers where the second parser depends on the first's result. impl<'a, I, O: 'a, U: 'a, F: Fn(O) -> Parser<'a, I, U> + 'a> Shr for Parser<'a, I, O> { type Output = Parser<'a, I, U>; fn shr(self, other: F) -> Self::Output { Parser::new(move |input: &'a [I], start: usize| { (self.method)(input, start).and_then(|(out, pos)| (other(out).method)(input, pos)) }) } } /// Ordered choice impl<'a, I, O: 'a> BitOr for Parser<'a, I, O> { type Output = Parser<'a, I, O>; fn bitor(self, other: Parser<'a, I, O>) -> Self::Output { Parser::new( move |input: &'a [I], start: usize| match (self.method)(input, start) { Ok(out) => Ok(out), Err(err) => match err { Error::Expect { .. } => Err(err), _ => (other.method)(input, start), }, }, ) } } /// And predicate impl<'a, I, O: 'a> Neg for Parser<'a, I, O> { type Output = Parser<'a, I, bool>; fn neg(self) -> Self::Output { Parser::new(move |input: &'a [I], start: usize| { (self.method)(input, start).map(|_| (true, start)) }) } } /// Not predicate impl<'a, I, O: 'a> Not for Parser<'a, I, O> { type Output = Parser<'a, I, bool>; fn not(self) -> Self::Output { Parser::new( move |input: &'a [I], start: usize| match (self.method)(input, start) { Ok(_) => Err(Error::Mismatch { message: "not predicate failed".to_string(), position: start, }), Err(_) => Ok((true, start)), }, ) } } #[cfg(test)] mod tests { use crate::parser::*; use crate::Error; #[test] fn byte_works() { let input = b"abcde"; let parser = sym(b'a') + one_of(b"ab") - sym(b'C'); let output = parser.parse(input); assert_eq!( output, Err(Error::Mismatch { message: "expect: 67, found: 99".to_string(), position: 2 }) ); let parser = sym(b'a') * none_of(b"AB") - sym(b'c') + seq(b"de"); let output = parser.parse(input); assert_eq!(output, Ok((b'b', &b"de"[..]))); assert_eq!(parser.pos().parse(input), Ok(5)); let parser = sym(b'e') | sym(b'd').expect("d") | empty().map(|_| b'0'); let output = parser.parse(input); 
assert_eq!( output, Err(Error::Expect { message: "Expect d".to_owned(), position: 0, inner: Box::new(Error::Mismatch { message: "expect: 100, found: 97".to_string(), position: 0 }) }) ); } #[test] fn char_works() { let input = "abcd".chars().collect::>(); let parser = tag("ab") + sym('c') | sym('d').map(|_| ("", '0')); let output = parser.parse(&input); assert_eq!(output, Ok(("ab", 'c'))); } #[test] fn recursive_parser() { #[derive(Debug, PartialEq)] enum Expr { Empty, Group(Box), } fn expr() -> Parser<'static, u8, Expr> { (sym(b'(') + call(expr) - sym(b')')).map(|(_, e)| Expr::Group(Box::new(e))) | empty().map(|_| Expr::Empty) } let input = b"(())"; let parser = expr(); let output = parser.parse(input); assert_eq!( output, Ok(Expr::Group(Box::new(Expr::Group(Box::new(Expr::Empty))))) ); } #[test] fn chain_parser() { let input = b"5oooooooo"; { let parser = one_of(b"0123456789").map(|c| c - b'0') >> |n| take(n as usize) + sym(b'o').repeat(0..); assert_eq!(parser.parse(input), Ok((&b"ooooo"[..], vec![b'o'; 3]))); } { let parser = skip(1) * take(3) >> |v: &'static [u8]| take(v.len() + 2).map(move |u| (u, v)); assert_eq!(parser.parse(input), Ok((&b"ooooo"[..], &b"ooo"[..]))); } { let parser = Parser::new(move |input, start| { (skip(1) * take(3)) .parse_at(input, start) .and_then(|(v, pos)| { take(v.len() + 2) .parse_at(input, pos) .map(|(u, end)| ((u, v), end)) }) }); assert_eq!(parser.parse(input), Ok((&b"ooooo"[..], &b"ooo"[..]))); } } #[test] fn repeat_at_least() { let input = b"xxxooo"; { let parser = sym(b'x').repeat(1..2); let output = parser.parse(input); assert_eq!(output, Ok(vec![b'x'; 1])) } { let parser = sym(b'x').repeat(1..); let output = parser.parse(input); assert_eq!(output, Ok(vec![b'x'; 3])) } { let parser = sym(b'x').repeat(0..); let output = parser.parse(input); assert_eq!(output, Ok(vec![b'x'; 3])) } { let parser = sym(b'y').repeat(0..); let output = parser.parse(input); assert_eq!(output, Ok(vec![])) } { let parser = sym(b'y').repeat(1..); let 
output = parser.parse(input); assert!(output.is_err()); } { let parser = sym(b'x').repeat(10..); let output = parser.parse(input); assert!(output.is_err()); } } #[test] fn repeat_up_to() { let input = b"xxxooo"; { let parser = sym(b'x').repeat(..2); let output = parser.parse(input); assert_eq!(output, Ok(vec![b'x'; 1])) } { let parser = sym(b'x').repeat(..4); let output = parser.parse(input); assert_eq!(output, Ok(vec![b'x'; 3])) } { let parser = sym(b'x').repeat(..); let output = parser.parse(input); assert_eq!(output, Ok(vec![b'x'; 3])) } { let parser = sym(b'x').repeat(..0); let output = parser.parse(input); assert_eq!(output, Ok(vec![])) } { let parser = sym(b'x').repeat(..10); let output = parser.parse(input); assert_eq!(output, Ok(vec![b'x'; 3])) } } #[test] fn repeat_up_to_inclusive() { let input = b"xxxooo"; { let parser = sym(b'x').repeat(..=2); let output = parser.parse(input); assert_eq!(output, Ok(vec![b'x'; 2])) } { let parser = sym(b'x').repeat(..=4); let output = parser.parse(input); assert_eq!(output, Ok(vec![b'x'; 3])) } { let parser = sym(b'x').repeat(..=0); let output = parser.parse(input); assert_eq!(output, Ok(vec![])) } { let parser = sym(b'x').repeat(..=10); let output = parser.parse(input); assert_eq!(output, Ok(vec![b'x'; 3])) } } #[test] fn repeat_from_to_inclusive() { let input = b"xxxooo"; { let parser = sym(b'x').repeat(1..=2); let output = parser.parse(input); assert_eq!(output, Ok(vec![b'x'; 2])) } { let parser = sym(b'x').repeat(1..=4); let output = parser.parse(input); assert_eq!(output, Ok(vec![b'x'; 3])) } { let parser = sym(b'x').repeat(0..=0); let output = parser.parse(input); assert_eq!(output, Ok(vec![])) } { let parser = sym(b'x').repeat(3..=10); let output = parser.parse(input); assert_eq!(output, Ok(vec![b'x'; 3])) } { let parser = sym(b'x').repeat(4..=10); let output = parser.parse(input); assert!(output.is_err()) } } #[test] fn repeat_exactly() { let input = b"xxxooo"; { let parser = sym(b'x').repeat(0); let output = 
parser.parse(input); assert_eq!(output, Ok(vec![])) } { let parser = sym(b'x').repeat(1); let output = parser.parse(input); assert_eq!(output, Ok(vec![b'x'; 1])) } { let parser = sym(b'x').repeat(2); let output = parser.parse(input); assert_eq!(output, Ok(vec![b'x'; 2])) } { let parser = sym(b'x').repeat(3); let output = parser.parse(input); assert_eq!(output, Ok(vec![b'x'; 3])) } { let parser = sym(b'x').repeat(4); let output = parser.parse(input); assert!(output.is_err()) } } #[cfg(not(feature = "trace"))] #[test] fn named() { let input = b"xxxooo"; { let parser = sym(b'x').repeat(3); let output = parser.name("name_test_ok").parse(input); assert_eq!(output, Ok(vec![b'x'; 3])) } { let parser = sym(b'x').repeat(4); let output = parser.name("name_test_err").parse(input); assert_eq!( output, Err(Error::Custom { message: "failed to parse name_test_err".into(), position: 0, inner: Some(Box::new(Error::Mismatch { message: "expect repeat at least 4 times, found 3 times".into(), position: 0 })) }) ) } } #[cfg(feature = "trace")] #[test] // Note: this doesn't test the tracing per se, just that the `name()` method executes // in the same way when the feature is turned on. 
fn named() { let input = b"xxxooo"; { let parser = sym(b'x').repeat(3); let output = parser.name("name_test_ok").parse(input); assert_eq!(output, Ok(vec![b'x'; 3])) } { let parser = sym(b'x').repeat(4); let output = parser.name("name_test_err").parse(input); assert_eq!( output, Err(Error::Custom { message: "failed to parse name_test_err".into(), position: 0, inner: Some(Box::new(Error::Mismatch { message: "expect repeat at least 4 times, found 3 times".into(), position: 0 })) }) ) } } } ================================================ FILE: src/range.rs ================================================ use std::ops::{Bound, RangeBounds, Range, RangeFrom, RangeFull, RangeInclusive, RangeTo, RangeToInclusive}; pub trait RangeArgument { fn start(&self) -> Bound<&usize>; fn end(&self) -> Bound<&usize>; } impl RangeArgument for Range { fn start(&self) -> Bound<&usize> { self.start_bound() } fn end(&self) -> Bound<&usize> { self.end_bound() } } impl RangeArgument for RangeFrom { fn start(&self) -> Bound<&usize> { self.start_bound() } fn end(&self) -> Bound<&usize> { self.end_bound() } } impl RangeArgument for RangeFull { fn start(&self) -> Bound<&usize> { self.start_bound() } fn end(&self) -> Bound<&usize> { self.end_bound() } } impl RangeArgument for RangeInclusive { fn start(&self) -> Bound<&usize> { self.start_bound() } fn end(&self) -> Bound<&usize> { self.end_bound() } } impl RangeArgument for RangeTo { fn start(&self) -> Bound<&usize> { self.start_bound() } fn end(&self) -> Bound<&usize> { self.end_bound() } } impl RangeArgument for RangeToInclusive { fn start(&self) -> Bound<&usize> { self.start_bound() } fn end(&self) -> Bound<&usize> { self.end_bound() } } impl RangeArgument for usize { fn start(&self) -> Bound<&usize> { Bound::Included(self) } fn end(&self) -> Bound<&usize> { Bound::Included(self) } } #[cfg(test)] mod test { use super::*; fn accept(ra: R, expected: impl std::ops::RangeBounds) where R: RangeArgument, T: std::fmt::Debug + std::cmp::PartialEq { 
assert_eq!(ra.start(), expected.start_bound()); assert_eq!(ra.end(), expected.end_bound()); } #[test] fn unbounded() { accept::(.., ..) } #[test] fn up_to_inclusive() { accept::(..=2, ..=2) } #[test] fn up_to_exclusive() { accept::(..2, ..2) } #[test] fn from() { accept::(1.., 1..) } #[test] fn from_to_inclusive() { accept::(1..=2, 1..=2) } #[test] fn from_to_exclusive() { accept::(1..3, 1..3) } #[test] fn exactly() { accept::(42, 42..=42) } } ================================================ FILE: src/result.rs ================================================ use std::{ error, fmt::{self, Display}, }; /// Parser error. #[derive(Debug, PartialEq, Clone)] pub enum Error { Incomplete, Mismatch { message: String, position: usize, }, Conversion { message: String, position: usize, }, Expect { message: String, position: usize, inner: Box, }, Custom { message: String, position: usize, inner: Option>, }, } impl error::Error for Error { fn description(&self) -> &'static str { "Parse error" } } impl Display for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { Self::Incomplete => write!(f, "Incomplete"), Self::Mismatch { ref message, ref position, } => write!(f, "Mismatch at {}: {}", position, message), Self::Conversion { ref message, ref position, } => write!(f, "Conversion failed at {}: {}", position, message), Self::Expect { ref message, ref position, ref inner, } => write!(f, "{} at {}: {}", message, position, inner), Self::Custom { ref message, ref position, inner: Some(ref inner), } => write!(f, "{} at {}, (inner: {})", message, position, inner), Self::Custom { ref message, ref position, inner: None, } => write!(f, "{} at {}", message, position), } } } /// Parser result, `Result` ia alias of `Result`. 
pub type Result = ::std::result::Result; ================================================ FILE: src/set.rs ================================================ use std::{ cmp::{PartialEq, PartialOrd}, ops::{Range, RangeFrom, RangeFull, RangeInclusive, RangeTo, RangeToInclusive}, str, }; /// Set relationship. pub trait Set { /// Whether a set contains an element or not. fn contains(&self, elem: &T) -> bool; /// Convert to text for display. fn to_str(&self) -> &str { "" } } impl Set for [T] { fn contains(&self, elem: &T) -> bool { (self as &[T]).contains(elem) } } impl Set for str { fn contains(&self, elem: &char) -> bool { (self as &str).contains(*elem) } fn to_str(&self) -> &str { self } } impl Set for Range { fn contains(&self, elem: &T) -> bool { self.start <= *elem && self.end > *elem } } impl Set for RangeFrom { fn contains(&self, elem: &T) -> bool { self.start <= *elem } } impl Set for RangeInclusive { fn contains(&self, elem: &T) -> bool { self.start() <= elem && self.end() >= elem } } impl Set for RangeTo { fn contains(&self, elem: &T) -> bool { self.end > *elem } } impl Set for RangeToInclusive { fn contains(&self, elem: &T) -> bool { self.end >= *elem } } impl Set for RangeFull { fn contains(&self, _: &T) -> bool { true } fn to_str(&self) -> &str { ".." 
} } impl Set for [u8; N] { fn contains(&self, elem: &u8) -> bool { (self as &[u8]).contains(elem) } fn to_str(&self) -> &str { str::from_utf8(self).unwrap_or("") } } #[cfg(test)] mod test { use crate::parser::*; #[test] fn one_of_using_set() { assert!(one_of(b"az").parse(b"a").is_ok()); assert!(one_of(b"az").parse(b"1").is_err()); } #[test] fn one_of_using_range() { assert!(one_of(&(b'a'..b'z')).parse(b"a").is_ok()); assert!(one_of(&(b'a'..b'z')).parse(b"z").is_err()); assert!(one_of(&(b'a'..b'z')).parse(b"1").is_err()); } #[test] fn one_of_using_range_to() { assert!(one_of(&(..b'z')).parse(b"a").is_ok()); assert!(one_of(&(..b'z')).parse(b"z").is_err()); assert!(one_of(&(..b'z')).parse(b"1").is_ok()); } #[test] fn one_of_using_range_inclusive() { assert!(one_of(&(b'a'..=b'z')).parse(b"a").is_ok()); assert!(one_of(&(b'a'..=b'z')).parse(b"z").is_ok()); assert!(one_of(&(b'a'..=b'z')).parse(b"1").is_err()); } #[test] fn one_of_using_range_to_inclusive() { assert!(one_of(&(..=b'z')).parse(b"a").is_ok()); assert!(one_of(&(..=b'z')).parse(b"z").is_ok()); assert!(one_of(&(..=b'z')).parse(b"1").is_ok()); } #[test] fn one_of_using_full_range() { assert!(one_of(&(..)).parse(b"a").is_ok()); assert!(one_of(&(..)).parse(b"z").is_ok()); assert!(one_of(&(..)).parse(b"1").is_ok()); } } ================================================ FILE: src/utf8.rs ================================================ // Variants of parser functions specialized for matching UTF-8 strings and returning chars use super::parser; use super::{Error, Result}; use crate::range::RangeArgument; use crate::set::Set; use bstr::decode_utf8; use std::fmt::Debug; use std::ops::{Add, BitOr, Mul, Neg, Not, Shr, Sub}; use std::str; // / Parser combinator. //type Parse<'a, O> = dyn Fn(&'a [u8], usize) -> Result<(O, usize)> + 'a; /// Being wrapped in this struct guarantees that the parser within will only match valid UTF-8 strings. 
pub struct Parser<'a, O>(parser::Parser<'a, u8, O>); impl<'a, O> Parser<'a, O> { /// Create new parser. pub fn new

(parse: P) -> Self where P: Fn(&'a [u8], usize) -> Result<(O, usize)> + 'a, { Self(parser::Parser::new(parse)) } /// Collect all matched input symbols. // This method is the primary reason utf8::Parser exists at all. pub fn collect(self) -> Parser<'a, &'a str> where O: 'a, { Parser(self.0.collect().map( // UNSAFE: Because we only could have constructed this object from other utf8::Parser objects, the match space must be valid UTF-8 |s| unsafe { str::from_utf8_unchecked(s) }, )) } // Remaining methods in impl only delegate to base parser::Parser /// Apply the parser to parse input. pub fn parse(&self, input: &'a [u8]) -> Result { self.0.parse(input) } /// Parse input at specified byte position. pub fn parse_at(&self, input: &'a [u8], start: usize) -> Result<(O, usize)> { self.0.parse_at(input, start) } /// Apply the parser to parse input. pub fn parse_str(&self, input: &'a str) -> Result { self.0.parse(input.as_bytes()) } /// Convert parser result to desired value. pub fn map(self, f: F) -> Parser<'a, U> where F: Fn(O) -> U + 'a, O: 'a, U: 'a, { Parser(self.0.map(f)) } /// Convert parser result to desired value, fail in case of conversion error. pub fn convert(self, f: F) -> Parser<'a, U> where F: Fn(O) -> ::std::result::Result + 'a, E: Debug, O: 'a, U: 'a, { Parser(self.0.convert(f)) } /// Cache parser output result to speed up backtracking. pub fn cache(self) -> Self where O: Clone + 'a, { Self(self.0.cache()) } /// Get input position after matching parser. pub fn pos(self) -> Parser<'a, usize> where O: 'a, { Parser(self.0.pos()) } /// Discard parser output. pub fn discard(self) -> Parser<'a, ()> where O: 'a, { Parser(self.0.discard()) } /// Make parser optional. 
pub fn opt(self) -> Parser<'a, Option> where O: 'a, { Parser(self.0.opt()) } /// `p.repeat(5)` repeat p exactly 5 times /// `p.repeat(0..)` repeat p zero or more times /// `p.repeat(1..)` repeat p one or more times /// `p.repeat(1..4)` match p at least 1 and at most 3 times pub fn repeat(self, range: R) -> Parser<'a, Vec> where R: RangeArgument + Debug + 'a, O: 'a, { Parser(self.0.repeat(range)) } /// Give parser a name to identify parsing errors. pub fn name(self, name: &'a str) -> Self where O: 'a, { Self(self.0.name(name)) } /// Mark parser as expected, abort early when failed in ordered choice. pub fn expect(self, name: &'a str) -> Self where O: 'a, { Self(self.0.expect(name)) } } impl<'a, O> From> for parser::Parser<'a, u8, O> { fn from(parser: Parser<'a, O>) -> Self { parser.0 // Simply unwrap } } pub fn decode(slice: &[u8], start: usize) -> Result<(char, usize)> { let (ch, size) = decode_utf8(&slice[start..]); let Some(ch) = ch else { return no_utf8(start, size); }; Ok((ch, size)) } // Helper for functions that decode_utf8 and fail fn no_utf8(start: usize, size: usize) -> Result { Err(Error::Mismatch { message: if size == 0 { "end of input reached" } else { "not UTF-8" } .to_owned(), position: start, }) } /// Match any UTF-8 character. pub fn any<'a>() -> Parser<'a, char> { Parser::new(|input: &[u8], start: usize| { let (ch, size) = decode(input, start)?; let pos = start + size; Ok((ch, pos)) }) } /// Match specific UTF-8 character. pub fn sym<'a>(tag: char) -> Parser<'a, char> { Parser::new(move |input: &[u8], start: usize| { let (ch, size) = decode(input, start)?; if ch != tag { return Err(Error::Mismatch { message: format!("expect: {}, found: {}", tag, ch), position: start, }); } let pos = start + size; Ok((ch, pos)) }) } /// Success when sequence of chars matches current input. 
pub fn seq<'a, 'b: 'a>(tag_str: &'b str) -> Parser<'a, &'a str> { let tag = tag_str.as_bytes(); Parser::new(move |input: &'a [u8], start: usize| { let mut index = 0; loop { let pos = start + index; if index == tag.len() { let result = &input[start..pos]; // UNSAFE: Because slice is byte-identical to a str, it is known valid UTF-8 let result_str = unsafe { str::from_utf8_unchecked(result) }; return Ok((result_str, pos)); } let Some(s) = input.get(pos) else { return Err(Error::Incomplete); }; if tag[index] != *s { return Err(Error::Mismatch { message: format!("seq {:?} at byte index: {}", tag, pos), position: pos, }); } index += 1; } }) } /// Success when current input symbol is one of the set. pub fn one_of<'a, S>(set: &'a S) -> Parser<'a, char> where S: Set + ?Sized, { Parser::new(move |input: &'a [u8], start: usize| { let (ch, size) = decode(input, start)?; if !set.contains(&ch) { return Err(Error::Mismatch { message: format!("expect one of: {}, found: {}", set.to_str(), ch), position: start, }); } let pos = start + size; Ok((ch, pos)) }) } /// Success when current input symbol is none of the set. pub fn none_of<'a, S>(set: &'a S) -> Parser<'a, char> where S: Set + ?Sized, { Parser::new(move |input: &'a [u8], start: usize| { let (ch, size) = decode(input, start)?; if set.contains(&ch) { return Err(Error::Mismatch { message: format!("expect one of: {}, found: {}", set.to_str(), ch), position: start, }); } let pos = start + size; Ok((ch, pos)) }) } /// Success when predicate returns true on current input symbol. pub fn is_a<'a, F>(predicate: F) -> Parser<'a, char> where F: Fn(char) -> bool + 'a, { Parser::new(move |input: &'a [u8], start: usize| { let (ch, size) = decode(input, start)?; if !predicate(ch) { return Err(Error::Mismatch { message: format!("is_a predicate failed on: {}", ch), position: start, }); } let pos = start + size; Ok((ch, pos)) }) } /// Success when predicate returns false on current input symbol. 
pub fn not_a<'a, F>(predicate: F) -> Parser<'a, char> where F: Fn(char) -> bool + 'a, { Parser::new(move |input: &'a [u8], start: usize| { let (ch, size) = decode(input, start)?; if predicate(ch) { return Err(Error::Mismatch { message: format!("is_a predicate failed on: {}", ch), position: start, }); } let pos = start + size; Ok((ch, pos)) }) } /// Read n chars. pub fn take<'a>(n: usize) -> Parser<'a, &'a str> { Parser::new(move |input: &'a [u8], start: usize| { let mut byte_pos = start; for _ in 0..n { let (ch, size) = decode_utf8(&input[start..]); if ch.is_none() { return no_utf8(byte_pos, size); } byte_pos += size; } let result = &input[start..byte_pos]; // UNSAFE: Because every char has been checked by decode_utf8, this string is known utf8 let result_str = unsafe { str::from_utf8_unchecked(result) }; Ok((result_str, byte_pos)) }) } /// Skip n symbols. pub fn skip<'a>(n: usize) -> Parser<'a, ()> { Parser::new(move |input: &'a [u8], start: usize| { let mut byte_pos = start; for _ in 0..n { let (ch, size) = decode_utf8(&input[start..]); if ch.is_none() { return no_utf8(byte_pos, size); } byte_pos += size; } Ok(((), byte_pos)) }) } /// Read n bytes exactly. pub fn take_bytes<'a>(n: usize) -> Parser<'a, &'a str> { Parser::new(move |input: &'a [u8], start: usize| { // FIXME: This runs in linear time because it checks each character. // If we could remember which inputs were passed in from parse_str() instead of parse(), // we could assume the characters are valid utf8 and run this in constant time by only checking // the final character using bstr::decode_last_utf8. 
let mut byte_pos = start; loop { let (ch, size) = decode_utf8(&input[start..]); if ch.is_none() { return no_utf8(byte_pos, size); } byte_pos += size; if byte_pos > n { return Err(Error::Mismatch { message: "range splits a UTF-8 character".to_owned(), position: start, }); } if byte_pos == n { let result = &input[start..byte_pos]; // UNSAFE: Because every char has been checked by decode_utf8, this string is known utf8 let result_str = unsafe { str::from_utf8_unchecked(result) }; return Ok((result_str, byte_pos)); } } }) } /// Skip n bytes exactly. pub fn skip_bytes<'a>(n: usize) -> Parser<'a, ()> { Parser::new(move |input: &'a [u8], start: usize| { // FIXME: See note on take_bytes. let mut byte_pos = start; loop { let (ch, size) = decode_utf8(&input[start..]); if ch.is_none() { return no_utf8(byte_pos, size); } byte_pos += size; if byte_pos > n { return Err(Error::Mismatch { message: "range splits a UTF-8 character".to_owned(), position: start, }); } if byte_pos == n { return Ok(((), byte_pos)); } } }) } /// Chain two parsers where the second parser depends on the first's result. impl<'a, O: 'a, U: 'a, F: Fn(O) -> Parser<'a, U> + 'a> Shr for Parser<'a, O> { type Output = Parser<'a, U>; fn shr(self, other: F) -> Self::Output { Parser::new(move |input: &'a [u8], start: usize| { (self.0.method)(input, start).and_then(|(out, pos)| (other(out).0.method)(input, pos)) }) } } // Note: There are no "degrade to parser::Parser" implementations for >> // because Rust cannot tell the difference between an FN(O)->U and an FN(O)->V. // Remaining functions in file only delegate to base parser::Parser /// Always succeeds, consume no input. pub fn empty<'a>() -> Parser<'a, ()> { Parser(parser::empty()) } /// Parse separated list. pub fn list<'a, O, U>(item: Parser<'a, O>, separator: Parser<'a, U>) -> Parser<'a, Vec> where O: 'a, U: 'a, { Parser(parser::list(item.0, separator.0)) } /// Call a parser factory, can be used to create recursive parsers. 
pub fn call<'a, O, F>(parser_factory: F) -> Parser<'a, O> where O: 'a, F: Fn() -> Parser<'a, O> + 'a, { Parser(parser::call(move || parser_factory().0)) } /// Success when end of input is reached. pub fn end<'a>() -> Parser<'a, ()> { Parser(parser::end()) } // And, Sub and Mul are similar enough we can implement them with macros macro_rules! utf_op { ( $impl_name:ident, $fn_name:ident, $op:tt, $return_type:ty, $doc:expr ) => { #[doc=$doc] impl<'a, Left: 'a, Right: 'a> $impl_name> for Parser<'a, Left> { type Output = Parser<'a, $return_type>; fn $fn_name (self, other: Parser<'a, Right>) -> Self::Output { Parser(self.0 $op other.0) } } }; } macro_rules! utf_u8_op { ( $impl_name:ident, $fn_name:ident, $op:tt, $return_type:ty, $doc:expr ) => { #[doc=concat!($doc, " (but degrade to non-utf8 parser)")] impl<'a, Left: 'a, Right: 'a> $impl_name> for Parser<'a, Left> { type Output = parser::Parser<'a, u8, $return_type>; fn $fn_name (self, other: parser::Parser<'a, u8, Right>) -> Self::Output { self.0 $op other } } }; } macro_rules! u8_utf_op { ( $impl_name:ident, $fn_name:ident, $op:tt, $return_type:ty, $doc:expr ) => { #[doc=concat!($doc, " (but degrade to non-utf8 parser)")] impl<'a, Left: 'a, Right: 'a> $impl_name> for parser::Parser<'a, u8, Left> { type Output = parser::Parser<'a, u8, $return_type>; fn $fn_name (self, other: Parser<'a, Right>) -> Self::Output { self $op other.0 } } }; } macro_rules! 
all_op { ( $impl_name:ident, $fn_name:ident, $op:tt, $return_type:ty, $doc:expr ) => { utf_op!($impl_name, $fn_name, $op, $return_type, $doc); utf_u8_op!($impl_name, $fn_name, $op, $return_type, $doc); u8_utf_op!($impl_name, $fn_name, $op, $return_type, $doc); }; } all_op!(Add, add, +, (Left, Right), "Sequence reserve value"); all_op!(Sub, sub, -, Left, "Sequence discard second value"); all_op!(Mul, mul, *, Right, "Sequence discard first value"); /// Ordered choice impl<'a, O: 'a> BitOr for Parser<'a, O> { type Output = Self; fn bitor(self, other: Self) -> Self { Self(self.0 | other.0) } } /// Ordered choice (but degrade to non-utf8 parser) impl<'a, O: 'a> BitOr> for Parser<'a, O> { type Output = parser::Parser<'a, u8, O>; fn bitor(self, other: parser::Parser<'a, u8, O>) -> Self::Output { self.0 | other } } /// Ordered choice (but degrade to non-utf8 parser) impl<'a, O: 'a> BitOr> for parser::Parser<'a, u8, O> { type Output = parser::Parser<'a, u8, O>; fn bitor(self, other: Parser<'a, O>) -> Self::Output { self | other.0 } } /// And predicate impl<'a, O: 'a> Neg for Parser<'a, O> { type Output = Parser<'a, bool>; fn neg(self) -> Self::Output { Parser(-self.0) } } /// Not predicate impl<'a, O: 'a> Not for Parser<'a, O> { type Output = Parser<'a, bool>; fn not(self) -> Self::Output { Parser(!self.0) } } ================================================ FILE: tests/list.rs ================================================ extern crate pom; use pom::parser::*; use pom::Parser; fn spaces() -> Parser { one_of(b" ").repeat(1..).discard() } fn works() -> Parser> { list(one_of(b"abc"), spaces() * seq(b"and") - spaces()) } fn dangle() -> Parser, &'static [u8])> { list(one_of(b"abc"), spaces() * seq(b"and") - spaces()) + seq(b" and") } #[test] fn test_list() { let one = b"a and b and c"; assert_eq!(works().parse(one), Ok(vec![b'a', b'b', b'c'])); let two = b"a and b and c and "; assert_eq!( dangle().parse(two), Ok((vec![b'a', b'b', b'c'], &b" and"[..])) ); }