Repository: zevv/npeg Branch: master Commit: 409f6796d0e8 Files: 42 Total size: 196.6 KB Directory structure: gitextract_c8vlvt__/ ├── .github/ │ └── workflows/ │ ├── ci.yml │ └── contents.yml ├── Changelog.md ├── INTERNALS.md ├── LICENSE ├── README.md ├── config.nims ├── doc/ │ ├── README.md │ └── papers/ │ └── README.md ├── misc/ │ ├── README │ ├── indent.nim │ ├── java.nim │ ├── mouse2npeg.nim │ └── rod.nim ├── npeg.nimble ├── src/ │ ├── npeg/ │ │ ├── capture.nim │ │ ├── codegen.nim │ │ ├── common.nim │ │ ├── dot.nim │ │ ├── grammar.nim │ │ ├── lib/ │ │ │ ├── core.nim │ │ │ ├── rfc3339.nim │ │ │ ├── types.nim │ │ │ ├── uri.nim │ │ │ └── utf8.nim │ │ ├── parsepatt.nim │ │ ├── patt.nim │ │ ├── railroad.nim │ │ └── stack.nim │ └── npeg.nim └── tests/ ├── basics.nim ├── captures.nim ├── config.nims ├── examples.nim ├── json-32M.bzip2 ├── lexparse.nim ├── lib.nim ├── nimversion.nim ├── performance.nim ├── precedence.nim ├── testdata └── tests.nim ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/ci.yml ================================================ name: CI on: push: pull_request: concurrency: group: ci-${{ github.ref }} cancel-in-progress: true jobs: build: strategy: fail-fast: false matrix: compiler: - name: nim version: devel - name: nim version: version-2-0 - name: nimskull version: "0.1.0-dev.21405" - name: nimskull version: "*" include: - compiler: name: nim version: devel build_doc: true name: ${{ matrix.compiler.name }} ${{ matrix.compiler.version }} runs-on: ubuntu-latest defaults: run: shell: bash working-directory: npeg steps: - name: Checkout uses: actions/checkout@v4.1.1 with: path: npeg - name: Setup Nim if: matrix.compiler.name == 'nim' uses: alaviss/setup-nim@0.1.1 with: path: nim version: ${{ matrix.compiler.version }} - name: Setup nimskull id: nimskull if: matrix.compiler.name == 'nimskull' uses: 
nim-works/setup-nimskull@0.1.1 with: nimskull-version: ${{ matrix.compiler.version }} - name: Run tests run: nim r --path:src tests/tests.nim - name: Build docs if: matrix.build_doc shell: bash run: | branch=$GITHUB_REF branch=${branch##*/} for i in src/npeg.nim src/npeg/*.nim; do nim doc --project --outdir:htmldocs \ --path:src \ "--git.url:https://github.com/$GITHUB_REPOSITORY" \ "--git.commit:$GITHUB_SHA" \ "--git.devel:$branch" \ "$i" done # Make npeg module the default page cp htmldocs/{npeg,index}.html - name: Upload GitHub Pages artifact if: matrix.build_doc uses: actions/upload-pages-artifact@v3.0.1 with: path: npeg/htmldocs deploy: needs: - build if: github.ref == 'refs/heads/master' permissions: actions: read pages: write id-token: write environment: name: github-pages url: ${{ steps.deployment.outputs.page_url }} name: Deploy docs to GitHub Pages runs-on: ubuntu-latest steps: - name: Deploy page id: deployment uses: actions/deploy-pages@v4.0.4 passed: needs: build if: failure() || cancelled() name: All tests passed runs-on: ubuntu-latest steps: - run: exit 1 ================================================ FILE: .github/workflows/contents.yml ================================================ name: Make table of contents on: push: paths: - README.md branches: - '**' jobs: make: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - uses: thatrandomperson5/AutoMarkdownContents@v1.1.1 with: file: README.md auto: true skip-first: true - name: Pull request uses: peter-evans/create-pull-request@v4 with: token: ${{ secrets.GITHUB_TOKEN }} title: "Add md table of contents" commit-message: ":clipboard: Added markdown table of contents" body: | :clipboard: Added markdown table of contents base: ${{ github.head_ref }} # Creates pull request onto pull request or commit branch branch: actions/automd ================================================ FILE: Changelog.md ================================================ 1.3.0 - 2024-08-22 ================== - Added CI 
(thanks Leorize) - Performance optimization - Some small refactorings 1.2.1 - 2023-03-04 ================== - fixes for --styleCheck=usages 1.2.0 - 2023-01-17 ================== - Split NPegException into more specific errors, updated documentation 1.1.2 - 2023-01-08 ================== - Fixed compat with Nim 1.0.11 1.1.1 - 2023-01-08 ================== - Disabled test with '∙' to avoid breaking older Nim 1.1.0 - 2023-01-08 ================== - Added alternate `∙` concatenation operator - Fixed fixBareExceptionWarning in Nim devel - Added table of contents to README.md 1.0.1 - 2022-12-10 ================== - Bugfix release, fixes "expression 'discard' has no type (or is ambiguous)" in rare cases 1.0.0 - 2022-11-27 ================== - Improved stack trace handling - Fixed matchFile() for empty files 0.27.0 - 2022-11-06 =================== - Augment the Nim stack trace with the NPeg return stack on exceptions - Documentation updates 0.26.0 - 2021-11-27 =================== - Improved lineinfo in code blocks for better backtraces - Some documentation improvements 0.25.0 - 2021-09-11 =================== - Omit the `.computedGoto.` in the inner parser loop for grammars with more than 10k instructions to work around the nim compiler limitation 0.24.1 - 2021-01-16 =================== - Added mixin for 'repr' to allow clean tracing of user types 0.24.0 - 2020-11-20 =================== - Added -d:npegGcsafe 0.23.2 - 2020-11-06 =================== - Small improvement in npeg syntax checking 0.23.0 - 2020-09-23 =================== - Reinstated [] out of bound check for captures - Dropped profiler support, the implementation was bad - Small documentation improvements - Added RFC3339 date parser to libs 0.22.2 - 2019-12-27 =================== - Skip --gc:arc tests for nim <1.1 to fix Nim CI builds. 0.22.1 - 2019-12-27 =================== - Bugfix in codegen causing problems with ^1 notation in code blocks. 
0.22.0 - 2019-12-24 =================== - Changed the parsing subject from `openArray[char]` to `openArray[T]` and added a 'literal' atom to the grammar. This allows NPeg to parse lists of any type, making it suitable for separate lexer and parser stages. See tests/lexparse.nim for a concise example. - Added `@` syntactic sugar to access the match offset inside code block captures. - Dropped Json and AST captures - no complaints heard since deprecation, and it simplifies the code base to aid the development of new features. 0.21.3 - 2019-12-06 =================== - Fixed off-by-one error in range `P[m..n]` operator, which would also match `P` times `n+1` - Various documentation improvements 0.21.2 - 2019-11-26 =================== - Fixed the way dollar captures are rewritten to avoid the name space clash which was introduced by Nim PR #12712. 0.21.1 - 2019-11-19 =================== - Bugfix for templates generating ordered choices 0.21.0 - 2019-10-28 =================== - anonymous `patt` patterns now also take a code block - deprecated AST and Json captures. AST captures are not flexible enough, and the functionality can be better implemented using code block captures and domain-specific AST object types. The Json captures were added in the early days of NPeg as a flexible way to store captures, but this does not mix well with custom captures and can not handle things like string unescaping. Both capture types were removed from the documentation and a .deprecated. pragma was added to the implementation. If you use Json or AST captures and think deprecation is a mistake, let me know. 0.20.0 - 2019-10-18 =================== - Added precedence operators - this allows constructions of Pratt parsers with bounded left recursion and operator precedence. 
- Added run time profiler, enable with -d:npegProfile - Performance improvements 0.19.0 - 2019-10-11 =================== - Significant performance improvements - Changed semantics of code block captures: $0 now always captures the total subject captured in a rule. This is a minor API change that only affects code using the `capture[]` notation inside code blocks - Added fail() function to force a parser fail in a code block capture - Added push() function to allow code block captures to push captures back on the stack - Check for loops caused by repeat of empty strings at compile time 0.18.0 - 2019-09-26 =================== - Runtime performance improvements 0.17.1 - 2019-09-19 =================== - Bugfix release (removed lingering debug echo) 0.17.0 - 2019-09-17 =================== - Various runtime and compiletime performance improvements 0.16.0 - 2019-09-08 =================== - Templates can now also be used in libraries - Added railroad diagram generation with -d:npegGraph - Improved error reporting 0.15.0 - 2019-08-31 =================== - Generic parser API changed: the peg() macro now explicitly passes the userdata type and identifier. 0.14.1 - 2019-08-28 =================== - Added templates / parameterised rules - Added custom match validation in code block capture - Added basic types, utf8 and uri libs - Added global pattern library support - Proc matchFile() now uses memfiles/mmap for zero copy parsers - Implemented method to pass user variable to code block captures - Added AST capture type for building simple abstract syntax trees - Added Jb() capture for Json booleans 0.13.0 - 2019-07-21 =================== - The capture[] variable available inside code block matches now allows access to the match offset as well. This is an API change since the type of capture changed from seq[string] to seq[Capture]. 
0.12.0 - 2019-07-14 =================== - Documentation updates - Made some error bounds compile-time configurable - Fix for more strict Nim compiler checks 0.11.0 - 2019-05-29 =================== - Added support for named backreferences - Added safeguards to prevent grammars growing out of bounds - Added Graphviz .dot debugging output for parser debugging - Added `matchLen` and `matchMax` fields to `NPegException` - Improved pattern syntax error messages 0.10.0 - 2019-04-24 =================== - Fixed 'Graph' character class 0.9.0 - 2019-03-31 ================== - Some syntax changes to fix compilation with mainline Nim 0.19.4 0.8.0 - 2019-03-30 ================== - Added syntactic sugar for accessing the captures[] seq in capture code blocks with dollar-number variables $1..$9 0.7.0 - 2019-03-29 ================== - Action callbacks (%) dropped in favour of Nim code block callbacks. 0.6.0 - 2019-03-27 ================== - API change: count syntax changed from {n} to [n]. - Optimizations in code generation 0.5.0 - 2019-03-27 ================== - API change: peg() and patt() now return an object of type Parser instead of a proc, and the function match(p: Parser) is now used for matching the subject. match() can match string and cstring types, matchFile() matches a file using memFile. - Added builtin atoms Upper, Lower, Digit, HexDigit, Alpha - Added `@` search operator - Added `&` and predicate 0.4.0 - 2019-03-24 ================== - Improved tracing output, during trace the originating rule name for each instruction is dumped. - Optimizations ================================================ FILE: INTERNALS.md ================================================ ## Introduction This document briefly describes the inner workings of NPeg. The main PEG algorithm is based on the Paper "A Text Pattern-Matching Tool based on Parsing Expression Grammars" by Roberto Ierusalimschy, who is also the author of LPEG. 
While LPEG uses a VM approach for parsing, NPeg adds an additional step where the VM code is compiled to native Nim code which does the parsing. This is how NPeg works in short: - The grammar is parsed by a Nim macro which recursively transforms this into a sequence of VM instructions for each grammar rule. - The set of instructions is 'linked' into a complete program of instructions - The linked program is translated/compiled into a state machine, implemented as a large Nim `case` statement that performs the parsing of the subject string. ## Data structures The following data structures are used for compiling the grammar: - `Inst`, short for "instruction": This is an object variant which implements a basic VM instruction. It consists of the opcode and a number of data fields. - `Patt`, short for "pattern": A pattern is a sequence of instructions `seq[Inst]` which typically match an atom from the grammar. - `Rule`: One complete, named pattern which is part of a grammar. - `Grammar`: A grammar is a collection of named patterns implemented as a `table[string, Patt]`. This is used as the intermediate representation of the complete compiled grammar and holds patterns for each of the named rules. - `Program`: A complete linked program, consisting of a pattern and its debug info (symbol table, textual listing) - `Parser`: object holding the compiled Nim matching function For captures the following data structures are relevant: - `CapFrame`: A capframe is a frame of a specific type on the capture stack that points to an offset in the subject string. For each capture open and close pair a frame exists on the stack, thus allowing for nested captures. - `Capture`: A capture is a completed capture that is collected and finalized when a capture is closed and finished. For the generic procs and types, the following convention is used: - `[T]` is the type of optional "user data" that gets passed into the parser. 
When this is not explicitly given with the `peg` macro, NPeg will stub this with an unused bool - `[S]` is the type of the subject. This is typically a string, although NPeg is generic enough and can parse any `seq[S]` ## Building a grammar The first step in building a parser is the translation of the grammar into snippets of VM instructions which match the data and perform flow control. For details of these instructions, refer to the paper by Ierusalimschy. The `Patt` data type is used to store a sequence of instructions. This section describes how a pattern is built from Nim code, all of which lives in `patt.nim` - this mechanism is later used by the macro which is parsing the actual PEG grammar. The basic atoms are constructed by the `newPatt()` procedures. These take an argument describing what needs to be matched in the subject, and deliver a short sequence of instructions. For example, the `newPatt("foo")` procedure will create a pattern consisting of a single instruction: ``` 1: line opStr "foo" ``` There are a number of operators defined which act on one or more patterns. These operators are used to combine multiple patterns into larger patterns. For example, the `|` operator is used for the PEG ordered choice. This takes two patterns, and results in a pattern that tries to match the first one and then skips the second, or tries to match the second if the first fails: ``` 0: line opChoice 3 1: line opStr "foo" 2: line opCommit 4 3: line opStr "bar" 4: opReturn ``` A number of patterns can be combined into a grammar, which is simply a table of patterns indexed by name. ## PEG DSL to grammar The user defines their NPeg grammar in a Nim code block, which consists of a number of named patterns. The whole grammar is handled by the `parseGrammar()` which iterates all individual named patterns. Each pattern is passed to the `parsePatt()` macro, which transforms the Nim code block AST into a NPeg grammar. 
This macro recursively goes through the Nim AST and calls `newPatt()` for building atoms, and calls the various operators acting on patterns to grow the grammar. ## Grammar to Nim code The `genCode()` procedure is used to convert the list of instructions into Nim code which implements the actual parser. This procedure builds a `case` statement for each VM instruction, and inserts a template for each opcode for each case. ## Example The following grammar is specified by the user: ``` lines <- *line line <- "foo" | "bar" ``` This is translated into the following VM program: ``` lines: 0: lines opChoice 3 1: lines opCall 4 line 2: lines opPartCommit 1 3: opReturn line: 4: line opChoice 7 5: line opStr "foo" 6: line opCommit 8 7: line opStr "bar" 8: opReturn ``` which is then translated into the following `case` statement: ``` while true: case ip of 0: opChoiceFn(3, "lines") of 1: opCallFn("line", 3, "lines") of 2: opPartCommitFn(1, "lines") of 3: opReturnFn("") of 4: opChoiceFn(7, "line") of 5: opStrFn("foo", "line") of 6: opCommitFn(8, "line") of 7: opStrFn("bar", "line") of 8: opReturnFn("") else: opFailFn() ``` ================================================ FILE: LICENSE ================================================ Copyright 2019 Ico Doornekamp Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT) ![Stability: experimental](https://img.shields.io/badge/stability-stable-green.svg) NPeg logo > "_Because friends don't let friends write parsers by hand_" NPeg is a pure Nim pattern matching library. It provides macros to compile patterns and grammars (PEGs) to Nim procedures which will parse a string and collect selected parts of the input. PEGs are not unlike regular expressions, but offer more power and flexibility, and have less ambiguities. (More about PEGs on [Wikipedia](https://en.wikipedia.org/wiki/Parsing_expression_grammar)) ![Graph](/doc/syntax-diagram.png) Some use cases where NPeg is useful are configuration or data file parsers, robust protocol implementations, input validation, lexing of programming languages or domain specific languages. Some NPeg highlights: - Grammar definitions and Nim code can be freely mixed. Nim code is embedded using the normal Nim code block syntax, and does not disrupt the grammar definition. - NPeg-generated parsers can be used both at run and at compile time. - NPeg offers various methods for tracing, optimizing and debugging your parsers. - NPeg can parse sequences of any data types, also making it suitable as a stage-two parser for lexed tokens. 
- NPeg can draw [cool diagrams](/doc/example-railroad.png) ## Contents - [Quickstart](#quickstart) - [Usage](#usage) * [Simple patterns](#simple-patterns) * [Grammars](#grammars) - [Syntax](#syntax) * [Atoms](#atoms) * [Operators](#operators) - [Precedence operators](#precedence-operators) - [Captures](#captures) * [String captures](#string-captures) * [Code block captures](#code-block-captures) - [Custom match validations](#custom-match-validations) - [Passing state](#passing-state) * [Backreferences](#backreferences) - [More about grammars](#more-about-grammars) * [Ordering of rules in a grammar](#ordering-of-rules-in-a-grammar) * [Templates, or parameterized rules](#templates-or-parameterized-rules) * [Composing grammars with libraries](#composing-grammars-with-libraries) * [Library rule overriding/shadowing](#library-rule-overridingshadowing) - [Error handling](#error-handling) * [MatchResult](#matchresult) * [NpegParseError exceptions](#npegparseerror-exceptions) * [Other exceptions](#other-exceptions) * [Parser stack trace](#parser-stack-trace) - [Advanced topics](#advanced-topics) * [Parsing other types then strings](#parsing-other-types-then-strings) - [Some notes on using PEGs](#some-notes-on-using-pegs) * [Anchoring and searching](#anchoring-and-searching) * [Complexity and performance](#complexity-and-performance) * [End of string](#end-of-string) * [Non-consuming atoms and captures](#non-consuming-atoms-and-captures) * [Left recursion](#left-recursion) * [UTF-8 / Unicode](#utf-8--unicode) - [Tracing and debugging](#tracing-and-debugging) * [Syntax diagrams](#syntax-diagrams) * [Grammar graphs](#grammar-graphs) * [Tracing](#tracing) - [Compile-time configuration](#compile-time-configuration) - [Tracing and debugging](#tracing-and-debugging-1) - [Random stuff and frequently asked questions](#random-stuff-and-frequently-asked-questions) * [Why does NPeg not support regular PEG syntax?](#why-does-npeg-not-support-regular-peg-syntax) * [Can NPeg be used to 
parse EBNF grammars?](#can-npeg-be-used-to-parse-ebnf-grammars) * [NPeg and generic functions](#npeg-and-generic-functions) - [Examples](#examples) * [Parsing arithmetic expressions](#parsing-arithmetic-expressions) * [A complete JSON parser](#a-complete-json-parser) * [Captures](#captures-1) * [More examples](#more-examples) - [Future directions / Todos / Roadmap / The long run](#future-directions--todos--roadmap--the-long-run) ## Quickstart Here is a simple example showing the power of NPeg: The macro `peg` compiles a grammar definition into a `parser` object, which is used to match a string and place the key-value pairs into the Nim table `words`: ```nim import npeg, strutils, tables type Dict = Table[string, int] let parser = peg("pairs", d: Dict): pairs <- pair * *(',' * pair) * !1 word <- +Alpha number <- +Digit pair <- >word * '=' * >number: d[$1] = parseInt($2) var words: Dict doAssert parser.match("one=1,two=2,three=3,four=4", words).ok echo words ``` Output: ```nim {"two": 2, "three": 3, "one": 1, "four": 4} ``` A brief explanation of the above code: * The macro `peg` is used to create a parser object, which uses `pairs` as the initial grammar rule to match. The variable `d` of type `Dict` will be available inside the code block parser for storing the parsed data. * The rule `pairs` matches one `pair`, followed by zero or more times (`*`) a comma followed by a `pair`. * The rules `word` and `number` match a sequence of one or more (`+`) alphabetic characters or digits, respectively. The `Alpha` and `Digit` rules are pre-defined rules matching the character classes `{'A'..'Z','a'..'z'}` and `{'0'..'9'}`. * The rule `pair` matches a `word`, followed by an equals sign (`=`), followed by a `number`. * The `word` and `number` in the `pair` rule are captured with the `>` operator. The Nim code fragment below this rule is executed for every match, and stores the captured word and number in the `words` Nim table. 
## Usage The `patt()` and `peg()` macros can be used to compile parser functions: - `patt()` creates a parser from a single anonymous pattern. - `peg()` allows the definition of a set of (potentially recursive) rules making up a complete grammar. The result of these macros is an object of the type `Parser` which can be used to parse a subject: ```nim proc match(p: Parser, s: string): MatchResult proc matchFile(p: Parser, fname: string): MatchResult ``` The above `match` functions return an object of the type `MatchResult`: ```nim MatchResult = object ok: bool matchLen: int matchMax: int ... ``` * `ok`: A boolean indicating if the matching succeeded without error. Note that a successful match does not imply that *all of the subject* was matched, unless the pattern explicitly matches the end-of-string. * `matchLen`: The number of input bytes of the subject that successfully matched. * `matchMax`: The highest index into the subject that was reached during parsing, *even if matching was backtracked or did not succeed*. This offset is usually a good indication of the location where the matching error occurred. The string captures made during the parsing can be accessed with: ```nim proc captures(m: MatchResult): seq[string] ``` ### Simple patterns A simple pattern can be compiled with the `patt` macro. For example, the pattern below splits a string by white space: ```nim let parser = patt *(*' ' * > +(1-' ')) echo parser.match(" one two three ").captures ``` Output: ``` @["one", "two", "three"] ``` The `patt` macro can take an optional code block which is used as code block capture for the pattern: ```nim var key, val: string let p = patt >+Digit * "=" * >+Alpha: (key, val) = ($1, $2) assert p.match("15=fifteen").ok echo key, " = ", val ``` ### Grammars The `peg` macro provides a method to define (recursive) grammars. The first argument is the name of the initial pattern, followed by a list of named patterns. 
Patterns can now refer to other patterns by name, allowing for recursion: ```nim let parser = peg "ident": lower <- {'a'..'z'} ident <- *lower doAssert parser.match("lowercaseword").ok ``` The order in which the grammar patterns are defined affects the generated parser. Although NPeg could always reorder, this is a design choice to give the user more control over the generated parser: * when a pattern `P1` refers to pattern `P2` which is defined *before* `P1`, `P2` will be inlined in `P1`. This increases the generated code size, but generally improves performance. * when a pattern `P1` refers to pattern `P2` which is defined *after* `P1`, `P2` will be generated as a subroutine which gets called from `P1`. This will reduce code size, but might also result in a slower parser. ## Syntax The NPeg syntax is similar to normal PEG notation, but some changes were made to allow the grammar to be properly parsed by the Nim compiler: - NPeg uses prefixes instead of suffixes for `*`, `+`, `-` and `?`. - Ordered choice uses `|` instead of `/` because of operator precedence. - The explicit `*` infix operator is used for sequences. 
NPeg patterns and grammars can be composed from the following parts: ```nim Atoms: 0 # matches always and consumes nothing 1 # matches any character n # matches exactly n characters 'x' # matches literal character 'x' "xyz" # matches literal string "xyz" i"xyz" # matches literal string, case insensitive {'x'..'y'} # matches any character in the range from 'x'..'y' {'x','y','z'} # matches any character from the set Operators: P1 * P2 # concatenation P1 | P2 # ordered choice P1 - P2 # matches P1 if P2 does not match (P) # grouping !P # matches everything but P &P # matches P without consuming input ?P # matches P zero or one times *P # matches P zero or more times +P # matches P one or more times @P # search for P P[n] # matches P n times P[m..n] # matches P m to n times Precedence operators: P ^ N # P is left associative with precedence N P ^^ N # P is right associative with precedence N String captures: >P # Captures the string matching P Back references: R("tag", P) # Create a named reference for pattern P R("tag") # Matches the given named reference Error handling: E"msg" # Raise an `NPegParseError` exception ``` In addition to the above, NPeg provides the following built-in shortcuts for common atoms, corresponding to POSIX character classes: ```nim Alnum <- {'A'..'Z','a'..'z','0'..'9'}, # Alphanumeric characters Alpha <- {'A'..'Z','a'..'z'}, # Alphabetic characters Blank <- {' ','\t'}, # Space and tab Cntrl <- {'\x00'..'\x1f','\x7f'}, # Control characters Digit <- {'0'..'9'}, # Digits Graph <- {'\x21'..'\x7e'}, # Visible characters Lower <- {'a'..'z'}, # Lowercase characters Print <- {'\x21'..'\x7e',' '}, # Visible characters and spaces Space <- {'\9'..'\13',' '}, # Whitespace characters Upper <- {'A'..'Z'}, # Uppercase characters Xdigit <- {'A'..'F','a'..'f','0'..'9'}, # Hexadecimal digits ``` ### Atoms Atoms are the basic building blocks for a grammar, describing the parts of the subject that should be matched. 
- Integer literal: `0` / `1` / `n` The int literal atom `n` matches exactly n number of bytes. `0` always matches, but does not consume any data. - Character and string literals: `'x'` / `"xyz"` / `i"xyz"` Characters and strings are literally matched. If a string is prefixed with `i`, it will be matched case insensitive. - Character sets: `{'x','y'}` Character set notation is similar to native Nim. A set consists of zero or more comma separated characters or character ranges. ```nim {'x'..'y'} # matches any character in the range from 'x'..'y' {'x','y','z'} # matches any character from the set 'x', 'y', and 'z' ``` The set syntax `{}` is flexible and can take multiple ranges and characters in one expression, for example `{'0'..'9','a'..'f','A'..'F'}`. ### Operators NPeg provides various prefix and infix operators. These operators combine or transform one or more patterns into expressions, building larger patterns. - Concatenation: `P1 * P2` ``` o──[P1]───[P2]──o ``` The pattern `P1 * P2` returns a new pattern that matches only if first `P1` matches, followed by `P2`. For example, `"foo" * "bar"` would only match the string `"foobar"`. Note: As an alternative for the `*` asterisk, the unicode glyph `∙` ("bullet operator", 0x2219) can also be used for concatenation. - Ordered choice: `P1 | P2` ``` o─┬─[P1]─┬─o ╰─[P2]─╯ ``` The pattern `P1 | P2` tries to first match pattern `P1`. If this succeeds, matching will proceed without trying `P2`. Only if `P1` can not be matched, NPeg will backtrack and try to match `P2` instead. Once either `P1` or `P2` has matched, the choice will be final ("committed"), and no more backtracking will be possible for this choice. For example `("foo" | "bar") * "fizz"` would match both `"foofizz"` and `"barfizz"`. NPeg optimizes the `|` operator for characters and character sets: The pattern `'a' | 'b' | 'c'` will be rewritten to a character set `{'a','b','c'}`. 
- Difference: `P1 - P2` The pattern `P1 - P2` matches `P1` *only* if `P2` does not match. This is equivalent to `!P2 * P1`: ``` ━━━━ o──[P2]─»─[P1]──o ``` NPeg optimizes the `-` operator for characters and character sets: The pattern `{'a','b','c'} - 'b'` will be rewritten to the character set `{'a','c'}`. - Grouping: `(P)` Brackets are used to group patterns similar to normal arithmetic expressions. - Not-predicate: `!P` ``` ━━━ o──[P]──o ``` The pattern `!P` returns a pattern that matches only if the input does not match `P`. In contrast to most other patterns, this pattern does not consume any input. A common usage for this operator is the pattern `!1`, meaning "only succeed if there is not a single character left to match" - which is only true for the end of the string. - And-predicate: `&P` ``` ━━━ ━━━ o──[P]──o ``` The pattern `&P` matches only if the input matches `P`, but will *not* consume any input. This is equivalent to `!!P`. This is denoted by a double negation in the railroad diagram, which is not very pretty unfortunately. - Optional: `?P` ``` ╭──»──╮ o─┴─[P]─┴─o ``` The pattern `?P` matches if `P` can be matched zero or one times, so essentially succeeds if `P` either matches or not. For example, `?"foo" * "bar"` matches both `"foobar"` and `"bar"`. - Match zero or more times: `*P` ``` ╭───»───╮ o─┴┬─[P]─┬┴─o ╰──«──╯ ``` The pattern `*P` tries to match as many occurrences of pattern `P` as possible - this operator always behaves *greedily*. For example, `*"foo" * "bar"` matches `"bar"`, `"foobar"`, `"foofoobar"`, etc. - Match one or more times: `+P` ``` o─┬─[P]─┬─o ╰──«──╯ ``` The pattern `+P` matches `P` at least once, but also more times. It is equivalent to `P * *P` - this operator always behaves *greedily*. - Search: `@P` This operator searches for pattern `P` using an optimized implementation. 
It is equivalent to `s <- *(1 - P) * P`, which can be read as "try to match as many characters as possible not matching `P`, and then match `P`": ``` ╭─────»─────╮ │ ━━━ │ o─┴┬─[P]─»─1─┬┴»─[P]──o ╰────«────╯ ``` Note that this operator does not allow capturing the skipped data up to the match; if this is required you can manually construct a grammar to do this. - Match exactly `n` times: `P[n]` The pattern `P[n]` matches `P` exactly `n` times. For example, `"foo"[3]` only matches the string `"foofoofoo"`: ``` o──[P]─»─[P]─»─[P]──o ``` - Match `m` to `n` times: `P[m..n]` The pattern `P[m..n]` matches `P` at least `m` and at most `n` times. For example, `"foo"[1..3]` matches `"foo"`, `"foofoo"` and `"foofoofoo"`: ``` ╭──»──╮ ╭──»──╮ o──[P]─»┴─[P]─┴»┴─[P]─┴─o ``` ## Precedence operators Note: This is an experimental feature, the implementation or API might change in the future. Precedence operators allow for the construction of "precedence climbing" or "Pratt parsers" with NPeg. The main use for this feature is building parsers for programming languages that follow the usual precedence and associativity rules of arithmetic expressions. - Left associative precedence of `N`: `P ^ N` ``` <1< o──[P]──o ``` - Right associative precedence of `N`: `P ^^ N` ``` >1> o──[P]──o ``` During parsing NPeg keeps track of the current precedence level of the parsed expression - the default is `0` if no precedence has been assigned yet. When the `^` operator is matched, either one of the next three cases applies: - `P ^ N` where `N > 0` and `N` is lower than the current precedence: in this case the current precedence is set to `N` and parsing of pattern `P` continues. - `P ^ N` where `N > 0` and `N` is higher than or equal to the current precedence: parsing will fail and backtrack. - `P ^ 0`: resets the current precedence to 0 and continues parsing. The main use case for this is parsing sub-expressions in parentheses. 
The heart of a Pratt parser in NPeg would look something like this: ```nim exp <- prefix * *infix parenExp <- ( "(" * exp * ")" ) ^ 0 prefix <- number | parenExp infix <- {'+','-'} * exp ^ 1 | {'*','/'} * exp ^ 2 | {'^'} * exp ^^ 3: ``` More extensive documentation will be added later, for now take a look at the example in `tests/precedence.nim`. ## Captures ``` ╭╶╶╶╶╶╮ s o────[P]────o ╰╶╶╶╶╶╯ ``` NPeg supports a number of ways to capture data when parsing a string. The various capture methods are described here, including a concise example. The capture examples below build on the following small PEG, which parses a comma separated list of key-value pairs: ```nim const data = "one=1,two=2,three=3,four=4" let parser = peg "pairs": pairs <- pair * *(',' * pair) * !1 word <- +Alpha number <- +Digit pair <- word * '=' * number let r = parser.match(data) ``` ### String captures The basic method for capturing is marking parts of the peg with the capture prefix `>`. During parsing NPeg keeps track of all matches, properly discarding any matches which were invalidated by backtracking. Only when parsing has fully succeeded it creates a `seq[string]` of all matched parts, which is then returned in the `MatchData.captures` field. In the example, the `>` capture prefix is added to the `word` and `number` rules, causing the matched words and numbers to be appended to the result capture `seq[string]`: ```nim let parser = peg "pairs": pairs <- pair * *(',' * pair) * !1 word <- +Alpha number <- +Digit pair <- >word * '=' * >number let r = parser.match(data) ``` The resulting list of captures is now: ```nim @["one", "1", "two", "2", "three", "3", "four", "4"] ``` ### Code block captures Code block captures offer the most flexibility for accessing matched data in NPeg. This allows you to define a grammar with embedded Nim code for handling the data during parsing. 
Note that for code block captures, the Nim code gets executed during parsing, *even if the match is part of a pattern that fails and is later backtracked*. When a grammar rule ends with a colon `:`, the next indented block in the grammar is interpreted as Nim code, which gets executed when the rule has been matched. Any string captures that were made inside the rule are available to the Nim code in the injected variable `capture[]` of type `seq[Capture]`: ``` type Capture = object s*: string # The captured string si*: int # The index of the captured string in the subject ``` The total subject matched by the code block rule is available in `capture[0]`. Any additional explicit `>` string captures made by the rule or any of its child rules will be available as `capture[1]`, `capture[2]`, ... For convenience there is syntactic sugar available in the code block capture blocks: - The variables `$0` to `$9` are rewritten to `capture[n].s` and can be used to access the captured strings. The `$` operator uses the usual Nim precedence, thus these variables might need parentheses or different ordering in some cases, for example `$1.parseInt` should be written as `parseInt($1)`. - The variables `@0` to `@9` are rewritten to `capture[n].si` and can be used to access the offset in the subject of the matched captures. Example: ```nim let p = peg foo: foo <- >(1 * >1) * 1: echo "$0 = ", $0 echo "$1 = ", $1 echo "$2 = ", $2 echo p.match("abc").ok ``` Will output ```nim $0 = abc $1 = ab $2 = b ``` Code block captures consume all embedded string captures, so these captures will no longer be available after matching. A code block capture can also produce captures by calling the `push(s: string)` function from the code block. Note that this is an experimental feature and that the API might change in future versions. The example has been extended to capture each word and number with the `>` string capture prefix.
When the `pair` rule is matched, the attached code block is executed, which adds the parsed key and value to the `words` table. ```nim from strutils import parseInt var words = initTable[string, int]() let parser = peg "pairs": pairs <- pair * *(',' * pair) * !1 word <- +Alpha number <- +Digit pair <- >word * '=' * >number: words[$1] = parseInt($2) let r = parser.match(data) ``` After the parsing finished, the `words` table will now contain: ```nim {"two": 2, "three": 3, "one": 1, "four": 4} ``` #### Custom match validations Code block captures can be used for additional validation of a captured string: the code block can call the functions `fail()` or `validate(bool)` to indicate if the match should succeed or fail. Failing matches are handled as if the capture itself failed and will result in the usual backtracking. When the `fail()` or `validate()` functions are not called, the match will succeed implicitly. For example, the following rule will check if a passed number is a valid `uint8` number: ```nim uint8 <- >Digit[1..3]: let v = parseInt($1) validate v>=0 and v<=255 ``` The following grammar will cause the whole parse to fail when the `error` rule matches: ```nim error <- 0: fail() ``` Note: The Nim code block is running within the NPeg parser context and in theory could access its internal state - this could be used to create custom validator/matcher functions that can inspect the subject string, do lookahead or lookback, and adjust the subject index to consume input. At the time of writing, NPeg lacks a formal API or interface for this though, and I am not sure yet what this should look like - If you are interested in doing this, contact me so we can discuss the details. #### Passing state NPeg allows passing of data of a specific type to the `match()` function, this value is then available inside code blocks as a variable. This mitigates the need for global variables for storing or retrieving data in access captures.
The syntax for passing data in a grammar is: ``` peg(name, identifier: Type) ``` For example, the above parser can be rewritten as such: ```nim type Dict = Table[string, int] let parser = peg("pairs", userdata: Dict): pairs <- pair * *(',' * pair) * !1 word <- +Alpha number <- +Digit pair <- >word * '=' * >number: userdata[$1] = parseInt($2) var words: Dict let r = parser.match(data, words) ``` ### Backreferences Backreferences allow NPeg to match an exact string that matched earlier in the grammar. This can be useful to match repetitions of the same word, or for example to match so called here-documents in programming languages. For this, NPeg offers the `R` operator with the following two uses: * The `R(name, P)` pattern creates a named reference for pattern `P` which can be referred to by name in other places in the grammar. * The pattern `R(name)` matches the contents of the named reference that has earlier been stored with the `R(name, P)` pattern. For example, the following rule will match only a string which will have the same character in the first and last position: ``` patt R("c", 1) * *(1 - R("c")) * R("c") * !1 ``` The first part of the rule `R("c", 1)` will match any character, and store this in the named reference `c`. The second part will match a sequence of zero or more characters that do not match reference `c`, followed by reference `c`. ## More about grammars ### Ordering of rules in a grammar Repetitive inlining of rules might cause a grammar to grow too large, resulting in a huge executable size and slow compilation. NPeg tries to mitigate this in two ways: * Patterns that are too large will not be inlined, even if the above ordering rules apply. * NPeg checks the size of the total grammar, and if it thinks it is too large it will fail compilation with the error message `NPeg: grammar too complex`. Check the section "Compile-time configuration" below for more details about too complex grammars.
The parser size and performance depends on many factors; when performance and/or code size matters, it pays to experiment with different orderings and measure the results. When in doubt, check the generated parser instructions by compiling with the `-d:npegTrace` or `-d:npegDotDir` flags - see the section Tracing and Debugging for more information. At this time the upper limit is 4096 rules, this might become a configurable number in a future release. For example, the following grammar will not compile because recursive inlining will cause it to expand to a parser with more than 4^6 = 4096 rules: ``` let p = peg "z": f <- 1 e <- f * f * f * f d <- e * e * e * e c <- d * d * d * d b <- c * c * c * c a <- b * b * b * b z <- a * a * a * a ``` The fix is to change the order of the rules so that instead of inlining NPeg will use a calling mechanism: ``` let p = peg "z": z <- a * a * a * a a <- b * b * b * b b <- c * c * c * c c <- d * d * d * d d <- e * e * e * e e <- f * f * f * f f <- 1 ``` When in doubt check the generated parser instructions by compiling with the `-d:npegTrace` flag - see the section Tracing and Debugging for more information. ### Templates, or parameterized rules When building more complex grammars you may find yourself duplicating certain constructs in patterns over and over again. To avoid code repetition (DRY), NPeg provides a simple mechanism to allow the creation of parameterized rules. In good Nim-fashion these rules are called "templates". Templates are defined just like normal rules, but have a list of arguments, which are referred to in the rule. Technically, templates just perform a basic search-and-replace operation: every occurrence of a named argument is replaced by the exact pattern passed to the template when called. For example, consider the following grammar: ```nim numberList <- +Digit * *( ',' * +Digit) wordList <- +Alpha * *( ',' * +Alpha) ``` This snippet uses a common pattern twice for matching lists: `p * *( ',' * p)`.
This matches pattern `p`, followed by zero or more occurrences of a comma followed by pattern `p`. For example, `numberList` will match the string `1,22,3`. The above example can be parameterized with a template like this: ```nim commaList(item) <- item * *( ',' * item ) numberList <- commaList(+Digit) wordList <- commaList(+Alpha) ``` Here the template `commaList` is defined, and any occurrence of its argument 'item' will be replaced with the patterns passed when calling the template. This template is used to define the more complex patterns `numberList` and `wordList`. Templates may invoke other templates recursively; for example the above can even be further generalized: ```nim list(item, sep) <- item * *( sep * item ) commaList(item) <- list(item, ',') numberList <- commaList(+Digit) wordList <- commaList(+Alpha) ``` ### Composing grammars with libraries For simple grammars it is usually fine to build all patterns from scratch from atoms and operators, but for more complex grammars it makes sense to define reusable patterns as basic building blocks. For this, NPeg keeps track of a global library of patterns and templates. The `grammar` macro can be used to add rules or templates to this library. All patterns in the library will be stored with a *qualified* identifier in the form `libraryname.patternname`, by which they can be referred to at a later time. For example, the following fragment defines three rules in the library with the name `number`. The rules will be stored in the global library and are referred to in the peg by their qualified names `number.dec`, `number.hex` and `number.oct`: ```nim grammar "number": dec <- {'1'..'9'} * *{'0'..'9'} hex <- i"0x" * +{'0'..'9','a'..'f','A'..'F'} oct <- '0' * *{'0'..'9'} let p = peg "line": line <- int * *("," * int) int <- number.dec | number.hex | number.oct let r = p.match("123,0x42,0644") ``` NPeg offers a number of pre-defined libraries for your convenience, these can be found in the `npeg/lib` directory. 
A library can be imported with the regular Nim `import` statement, all rules defined in the imported file will then be added to NPeg's global pattern library. For example: ```nim import npeg/lib/uri ``` Note that templates defined in libraries do not implicitly bind the rules from that grammar; instead, you need to explicitly qualify the rules used in the template to refer to the grammar. For example: ```nim grammar "foo": open <- "(" close <- ")" inBrackets(body) <- foo.open * body * foo.close ``` ### Library rule overriding/shadowing To allow the user to add custom captures to imported grammars or rules, it is possible to *override* or *shadow* an existing rule in a grammar. Overriding will replace the rule from the library with the provided new rule, allowing the caller to change parts of an imported grammar. An overridden rule is allowed to reference the original rule by name, which will cause the new rule to *shadow* the original rule. This will effectively rename the original rule and replace it with the newly defined rule which will call the original referred rule. For example, the following snippet will reuse the grammar from the `uri` library and capture some parts of the URI in a Nim object: ```nim import npeg/lib/uri type Uri = object host: string scheme: string path: string port: int var myUri: Uri let parser = peg "line": line <- uri.URI uri.scheme <- >uri.scheme: myUri.scheme = $1 uri.host <- >uri.host: myUri.host = $1 uri.port <- >uri.port: myUri.port = parseInt($1) uri.path <- >uri.path: myUri.path = $1 echo parser.match("http://nim-lang.org:8080/one/two/three") echo myUri # --> (host: "nim-lang.org", scheme: "http", path: "/one/two/three", port: 8080) ``` ## Error handling NPeg offers a number of ways to handle errors during parsing a subject string; what method best suits your parser depends on your requirements.
### MatchResult The simplest way to handle errors is to inspect the `MatchResult` object that is returned by the `match()` proc: ```nim MatchResult = object ok: bool matchLen: int matchMax: int ``` The `ok` field in the `MatchResult` indicates if the parser was successful: when the complete pattern has been matched this value will be set to `true`, if the complete pattern did not match the subject the value will be `false`. In addition to the `ok` field, the `matchMax` field indicates the maximum offset into the subject the parser was able to match the string. If the matching succeeded `matchMax` equals the total length of the subject, if the matching failed, the value of `matchMax` is usually a good indication of where in the subject string the error occurred: ``` let a = patt 4 let r = a.match("123") if not r.ok: echo "Parsing failed at position ", r.matchMax ``` ### NPegParseError exceptions When, during matching, the parser reaches an `E"message"` atom in the grammar, NPeg will raise an `NPegParseError` exception with the given message. The typical use case for this atom is to be combined with the ordered choice `|` operator to generate helpful error messages. The following example illustrates this: ```nim let parser = peg "list": list <- word * *(comma * word) * !1 word <- +Alpha | E"expected word" comma <- ',' | E"expected comma" try: echo parser.match("one,two;three") except NPegParseError as e: echo "Parsing failed at position ", e.matchMax, ": ", e.msg ``` The rule `comma` tries to match the literal `','`. If this can not be matched, the rule `E"expected comma"` will match instead, where `E` will raise an `NPegParseError` exception.
The `NPegParseError` type contains the same two fields as `MatchResult` to indicate where in the subject string the match failed: `matchLen` and `matchMax`, which can be used as an indication of the location of the parse error: ``` Parsing failed at position 7: expected comma ``` ### Other exceptions NPeg can raise a number of other exception types during parsing: - `NPegParseError`: described in the previous section - `NPegStackOverflowError`: a stack overflow occurred in the backtrace or call stack; this is usually an indication of a faulty or too complex grammar. - `NPegUnknownBackrefError`: An unknown back reference identifier is used in an `R()` rule. - `NPegCaptureOutOfRangeError`: A code block capture tries to access a capture that is not available using the `$` notation or by accessing the `capture[]` seq. All the above errors are inherited from the generic `NPegException` object. ### Parser stack trace If an exception is raised from within an NPeg parser - either by the `E` atom or by nim code in a code block capture - NPeg will augment the Nim stack trace with frames indicating where in the grammar the exception occurred. The above example will generate the following stack trace, note the last two entries which are added by NPeg and show the rules in which the exception occurred: ``` /tmp/list.nim(9) list ./npeg/src/npeg.nim(142) match ./npeg/src/npeg.nim(135) match /tmp/flop.nim(4) list <- word * *(comma * word) * eof /tmp/flop.nim(7) word <- +{'a' .. 'z'} | E"expected word" Error: unhandled exception: Parsing error at #14: "expected word" [NPegParseError] ``` Note: this requires Nim 'devel' or version > 1.6.x; on older versions you can use `-d:npegStackTrace` to make NPeg dump the stack to stdout. ## Advanced topics ### Parsing other types then strings Note: This is an experimental feature, the implementation or API might change in the future.
NPeg was originally designed to parse strings like a regular PEG engine, but has since evolved into a generic parser that can parse any subject of type `openArray[T]`. This section describes how to use this feature. - The `peg()` macro must be passed an additional argument specifying the base type `T` of the subject; the generated parser will then parse a subject of type `openArray[T]`. When not given, the default type is `char`, and the parser parses `openArray[char]`, or more typically, `string`. - When matching non-strings, some of the usual atoms like strings or character sets do not make sense in a grammar, instead the grammar uses literal atoms. Literals can be specified in square brackets and are interpreted as any Nim code: `[foo]`, `[1+1]` or `["foo"]` are all valid literals. - When matching non-strings, captures will be limited to only a single element of the base type, as this makes more sense when parsing a token stream. For an example of this feature check the example in `tests/lexparse.nim` - this implements a classic parser with separate lexing and parsing stages. ## Some notes on using PEGs ### Anchoring and searching Unlike regular expressions, PEGs are always matched in *anchored* mode only: the defined pattern is matched from the start of the subject string. For example, the pattern `"bar"` does not match the string `"foobar"`. To search for a pattern in a stream, a construct like this can be used: ```nim p <- "bar" search <- p | 1 * search ``` The above grammar first tries to match pattern `p`, or if that fails, matches any character `1` and recurs back to itself. Because searching is a common operation, NPeg provides the builtin `@P` operator for this. ### Complexity and performance Although it is possible to write patterns with exponential time complexity for NPeg, they are much less common than in regular expressions, thanks to the limited backtracking.
In particular, patterns written without grammatical rules always have a worst-case time `O(n^k)` (and space `O(k)`, which is constant for a given pattern), where `k` is the pattern's star height. Moreover, NPeg has a simple and clear performance model that allows programmers to understand and predict the time complexity of their patterns. The model also provides a firm basis for pattern optimizations. (Adapted from Ierusalimschy, "A Text Pattern-Matching Tool based on Parsing Expression Grammars", 2008) ### End of string PEGs do not care what is in the subject string after the matching succeeds. For example, the rule `"foo"` happily matches the string `"foobar"`. To make sure the pattern matches the end of string, this has to be made explicit in the pattern. The idiomatic notation for this is `!1`, meaning "only succeed if there is not a single character left to match" - which is only true for the end of the string. ### Non-consuming atoms and captures The lookahead(`&`) and not(`!`) operators may not consume any input, and make sure that after matching the internal parsing state of the parser is reset to what it was before the operator was started, including the state of the captures. This means that any captures made inside a `&` and `!` block also are discarded. It is possible however to capture the contents of a non-consuming block with a code block capture, as these are _always_ executed, even when the parser state is rolled back afterwards. ### Left recursion NPeg does not support left recursion (this applies to PEGs in general). For example, the rule ```nim A <- A | 'a' ``` will cause an infinite loop because it allows for left-recursion of the non-terminal `A`. Similarly, the grammar ```nim A <- B | 'a' A B <- A ``` is problematic because it is mutually left-recursive through the non-terminal `B`. Note that loops of patterns that can match the empty string will not result in the expected behavior.
For example, the rule `*0` will cause the parser to stall and go into an infinite loop. ### UTF-8 / Unicode NPeg has no built-in support for Unicode or UTF-8, instead it is simply able to parse UTF-8 documents just like any other string. NPeg comes with a simple UTF-8 grammar library which should simplify common operations like matching a single code point or character class. The following grammar splits a UTF-8 document into separate characters/glyphs by using the `utf8.any` rule: ```nim import npeg/lib/utf8 let p = peg "line": line <- +char char <- >utf8.any let r = p.match("γνωρίζω") echo r.captures() # --> @["γ", "ν", "ω", "ρ", "ί", "ζ", "ω"] ``` ## Tracing and debugging ### Syntax diagrams When compiled with `-d:npegGraph`, NPeg will dump [syntax diagrams](https://en.wikipedia.org/wiki/Syntax_diagram) (also known as railroad diagrams) for all parsed rules. Syntax diagrams are sometimes helpful to understand or debug a grammar, or to get more insight in a grammar's complexity. ``` ╭─────────»──────────╮ │ ╭─────»──────╮│ ╭╶╶╶╶╶╶╶╶╶╶╮ │ │ ━━━━ ││ ╭╶╶╶╶╶╶╶╮ inf o──"INF:"─»───[number]───»┴─","─»┴┬─[lf]─»─1─┬┴┴»─[lf]─»───[url]────o ╰╶╶╶╶╶╶╶╶╶╶╯ ╰────«─────╯ ╰╶╶╶╶╶╶╶╯ ``` * Optionals (`?`) are indicated by a forward arrow overhead. * Repeats (`+`) are indicated by a backwards arrow underneath. * Literals (strings, chars, sets) are printed in purple. * Non-terminals are printed in cyan between square brackets. * Not-predicates (`!`) are overlined in red. Note that the diagram does not make it clear that the input for not-predicates is not consumed. * Captures are boxed in a gray rectangle, optionally including the capture name. [Here](/doc/example-railroad.png) is a larger example of a URL parser. ### Grammar graphs NPeg can generate a graphical representation of a grammar to show the relations between rules. The generated output is a `.dot` file which can be processed by the Graphviz tool to generate an actual image file.
When compiled with `-d:npegDotDir=PATH`, NPeg will generate a `.dot` file for each grammar in the code and write it to the given directory. ![graph](/doc/example-graph.png) * Edge colors represent the rule relation: grey=inline, blue=call, green=builtin * Rule colors represent the relative size/complexity of a rule: black=<10, orange=10..100, red=>100 Large rules result in larger generated code and slow compile times. Rule size can generally be decreased by changing the rule order in a grammar to allow NPeg to call rules instead of inlining them. ### Tracing When compiled with `-d:npegTrace`, NPeg will dump its intermediate representation of the compiled PEG, and will dump a trace of the execution during matching. These traces can be used for debugging or optimization of a grammar. For example, the following program: ```nim let parser = peg "line": space <- ' ' line <- word * *(space * word) word <- +{'a'..'z'} discard parser.match("one two") ``` will output the following intermediate representation at compile time. From the IR it can be seen that the `space` rule has been inlined in the `line` rule, but that the `word` rule has been emitted as a subroutine which gets called from `line`: ``` line: 0: line opCall 6 word word 1: line opChoice 5 *(space * word) 2: space opStr " " ' ' 3: line opCall 6 word word 4: line opPartCommit 2 *(space * word) 5: opReturn word: 6: word opSet '{'a'..'z'}' {'a' .. 'z'} 7: word opSpan '{'a'..'z'}' +{'a' .. 'z'} 8: opReturn ``` At runtime, the following trace is generated. The trace consists of a number of columns: 1. The current instruction pointer, which maps to the compile time dump. 2. The index into the subject. 3. The substring of the subject. 4. The name of the rule from which this instruction originated. 5. The instruction being executed. 6. The backtrace stack depth.
``` 0| 0|one two |line |call -> word:6 | 6| 0|one two |word |set {'a'..'z'} | 7| 1|ne two |word |span {'a'..'z'} | 8| 3| two | |return | 1| 3| two |line |choice -> 5 | 2| 3| two | space |chr " " |* 3| 4|two |line |call -> word:6 |* 6| 4|two |word |set {'a'..'z'} |* 7| 5|wo |word |span {'a'..'z'} |* 8| 7| | |return |* 4| 7| |line |pcommit -> 2 |* 2| 7| | space |chr " " |* | 7| | |fail |* 5| 7| | |return (done) | ``` The exact meaning of the IR instructions is not discussed here. ## Compile-time configuration NPeg has a number of configurable setting which can be configured at compile time by passing flags to the compiler. The default values should be ok in most cases, but if you ever run into one of those limits you are free to configure those to your liking: * `-d:npegPattMaxLen=N` This is the maximum allowed length of NPeg's internal representation of a parser, before it gets translated to Nim code. The reason to check for an upper limit is that some grammars can grow exponentially by inlining of patterns, resulting in slow compile times and oversized executable size. (default: 4096) * `-d:npegInlineMaxLen=N` This is the maximum allowed length of a pattern to be inlined. Inlining generally results in a faster parser, but also increases code size. It is valid to set this value to 0; in that case NPeg will never inline patterns and use a calling mechanism instead, this will result in the smallest code size. (default: 50) * `-d:npegRetStackSize=N` Maximum allowed depth of the return stack for the parser. The default value should be high enough for practical purposes, the stack depth is only limited to detect invalid grammars. (default: 1024) * `-d:npegBackStackSize=N` Maximum allowed depth of the backtrace stack for the parser. The default value should be high enough for practical purposes, the stack depth is only limited to detect invalid grammars. 
(default: 1024) * `-d:npegGcsafe` This is a workaround for the case where NPeg needs to be used from a `{.gcsafe.}` context when using threads. This will mark the generated matching function to be `{.gcsafe.}`. ## Tracing and debugging NPeg has a number of compile time flags to enable tracing and debugging of the generated parser: * `-d:npegTrace`: Enable compile time and run time tracing. Please refer to the section 'Tracing' for more details. * `-d:npegGraph`: Dump syntax diagrams of all parsed rules at compile time. These flags are meant for debugging NPeg itself, and are typically not useful to the end user: * `-d:npegDebug`: Enable more debug info. Meant for NPeg development debugging purposes only. * `-d:npegExpand`: Dump the generated Nim code for all parsers defined in the program. Meant for NPeg development debugging purposes only. * `-d:npegStacktrace`: When enabled, NPeg will dump a stack trace of the current position in the parser when an exception is thrown by NPeg itself or by Nim code in code block captures. ## Random stuff and frequently asked questions ### Why does NPeg not support regular PEG syntax? The NPeg syntax is similar, but not exactly the same as the official PEG syntax: it uses some different operators, and prefix instead of postfix operators. The reason for this is that the NPeg grammar is parsed by a Nim macro in order to allow code block captures to embed Nim code, which puts some limitations on the available syntax. Also, NPeg's operators are chosen so that they have the right precedence for PEGs. The result is that the grammar itself is expressed as valid Nim, which has the nice side effect of allowing syntax highlighting and code completion to work with your favorite editor. ### Can NPeg be used to parse EBNF grammars? Almost, but not quite. Although PEGs and EBNF look quite similar, there are some subtle but important differences which do not allow a literal translation from EBNF to PEG.
Notable differences are left recursion and ordered choice. Also, see "From EBNF to PEG" from Roman R. Redziejowski. ### NPeg and generic functions Nim's macro system is sometimes finicky and not well defined, and NPeg seems to push it to the limit. This means that you might run into strange and unexpected issues, especially when mixing NPeg with generic code. If you run into weird error messages that do not seem to make sense when using NPeg from generic procs, check the links below for more information and possible workarounds: - https://github.com/nim-lang/Nim/issues/22740 - https://github.com/zevv/npeg/issues/68 ## Examples ### Parsing arithmetic expressions ```nim let parser = peg "line": exp <- term * *( ('+'|'-') * term) term <- factor * *( ('*'|'/') * factor) factor <- +{'0'..'9'} | ('(' * exp * ')') line <- exp * !1 doAssert parser.match("3*(4+15)+2").ok ``` ### A complete JSON parser The following PEG defines a complete parser for the JSON language - it will not produce any captures, but simple traverse and validate the document: ```nim let s = peg "doc": S <- *Space jtrue <- "true" jfalse <- "false" jnull <- "null" unicodeEscape <- 'u' * Xdigit[4] escape <- '\\' * ({ '{', '"', '|', '\\', 'b', 'f', 'n', 'r', 't' } | unicodeEscape) stringBody <- ?escape * *( +( {'\x20'..'\xff'} - {'"'} - {'\\'}) * *escape) jstring <- ?S * '"' * stringBody * '"' * ?S minus <- '-' intPart <- '0' | (Digit-'0') * *Digit fractPart <- "." 
* +Digit expPart <- ( 'e' | 'E' ) * ?( '+' | '-' ) * +Digit jnumber <- ?minus * intPart * ?fractPart * ?expPart doc <- JSON * !1 JSON <- ?S * ( jnumber | jobject | jarray | jstring | jtrue | jfalse | jnull ) * ?S jobject <- '{' * ( jstring * ":" * JSON * *( "," * jstring * ":" * JSON ) | ?S ) * "}" jarray <- "[" * ( JSON * *( "," * JSON ) | ?S ) * "]" doAssert s.match(json).ok let doc = """ {"jsonrpc": "2.0", "method": "subtract", "params": [42, 23], "id": 1} """ doAssert s.match(doc).ok ``` ### Captures The following example shows how to use code block captures. The defined grammar will parse a HTTP response document and extract structured data from the document into a Nim object: ```nim import npeg, strutils, tables type Request = object proto: string version: string code: int message: string headers: Table[string, string] # HTTP grammar (simplified) let parser = peg("http", userdata: Request): space <- ' ' crlf <- '\n' * ?'\r' url <- +(Alpha | Digit | '/' | '_' | '.') eof <- !1 header_name <- +(Alpha | '-') header_val <- +(1-{'\n'}-{'\r'}) proto <- >+Alpha: userdata.proto = $1 version <- >(+Digit * '.' * +Digit): userdata.version = $1 code <- >+Digit: userdata.code = parseInt($1) msg <- >(+(1 - '\r' - '\n')): userdata.message = $1 header <- >header_name * ": " * >header_val: userdata.headers[$1] = $2 response <- proto * '/' * version * space * code * space * msg headers <- *(header * crlf) http <- response * crlf * headers * eof # Parse the data and print the resulting table const data = """ HTTP/1.1 301 Moved Permanently Content-Length: 162 Content-Type: text/html Location: https://nim.org/ """ var request: Request let res = parser.match(data, request) echo request ``` The resulting data: ```nim ( proto: "HTTP", version: "1.1", code: 301, message: "Moved Permanently", headers: { "Content-Length": "162", "Content-Type": "text/html", "Location": "https://nim.org/" } ) ``` ### More examples More examples can be found in tests/examples.nim.
## Future directions / Todos / Roadmap / The long run Here are some things I'd like to have implemented one day. Some are hard and require me to better understand what I'm doing first. In no particular order: - Handling left recursion: PEGs are typically not good at handling grammars involving left recursion, see https://en.wikipedia.org/wiki/Parsing_expression_grammar#Indirect_left_recursion for an explanation of the problem. However, some smart people have found a way to make this work anyway, but I am not yet able to understand this well enough to implement this in NPeg. https://github.com/zevv/npeg/blob/master/doc/papers/Left_recursion_in_parsing_expression_grammars.pdf - Design and implement a proper API for code block captures. The current API feels fragile and fragmented (`capture[], $1/$2, fail(), validate()`), and does not offer solid primitives to make custom match functions yet; something better should be in place before NPeg goes v1.0. - Resuming/streaming: The current parser is almost ready to be invoked multiple times, resuming parsing where it left off - this should allow parsing of (infinite) streams. The only problem not solved yet is how to handle captures: when a block of data is parsed it might contain data which must later be available to collect the capture. Not sure how to handle this yet. - Memoization: I guess it would be possible to add (limited) memoization to improve performance, but no clue where to start yet. - Parallelization: I wonder if parsing can be parallelized: when reaching an ordered choice, multiple threads should be able to try to parse each individual choice. I do see problems with captures here, though. - I'm not happy about the `{.gcsafe.}` workaround. I'd be happy to hear any ideas on how to improve this.
================================================ FILE: config.nims ================================================ --styleCheck:usages if (NimMajor, NimMinor) < (1, 6): --styleCheck:hint else: --styleCheck:error ================================================ FILE: doc/README.md ================================================ This directory contains various papers which were used for inspiration when building Npeg. ================================================ FILE: doc/papers/README.md ================================================ This is a collection of papers somehow relevant to NPeg. ================================================ FILE: misc/README ================================================ This directory contains various snippets, examples or other helpful things that I want to keep around but do not fit in elsewhere. ================================================ FILE: misc/indent.nim ================================================ # Indent syntax let data = """ a=123 b= c=567 e=42 f=18 g= b=44 c=22 """ var indentStack = @[""] template top[T](s: seq[T]): T = s[s.high] let p = peg doc: doc <- pairs * !1 pairs <- pair * *('\n' * pair) pair <- indSame * key * '=' * val indentPairs <- '\n' * &indIn * pairs * &('\n' * indOut) key <- +Alpha: echo "key ", $0 number <- +Digit: echo "val ", $0 val <- number | indentPairs indSame <- *' ': validate $0 == indentStack.top indIn <- *' ': validate len($0) > len(indentStack.top) indentStack.add $0 indOut <- *' ': discard indentStack.pop validate $0 == indentStack.top echo p.match(data).ok ================================================ FILE: misc/java.nim ================================================ # # This grammar has been auto-generated with mouse2npeg from the Mouse Java-1.6 # grammar at http://www.romanredz.se/Mouse/Java.1.6.peg. 
It is not nice to look # at, but it does parse Java # import npeg let r = peg CompilationUnit: CompilationUnit <- Spacing * ?PackageDeclaration * *ImportDeclaration * *TypeDeclaration * EOT PackageDeclaration <- *Annotation * PACKAGE * QualifiedIdentifier * SEMI ImportDeclaration <- IMPORT * ?STATIC * QualifiedIdentifier * ?( DOT * STAR ) * SEMI TypeDeclaration <- *Modifier * ( ClassDeclaration | EnumDeclaration | InterfaceDeclaration | AnnotationTypeDeclaration ) | SEMI ClassDeclaration <- CLASS * Identifier * ?TypeParameters * ?( EXTENDS * ClassType ) * ?( IMPLEMENTS * ClassTypeList ) * ClassBody ClassBody <- LWING * *ClassBodyDeclaration * RWING ClassBodyDeclaration <- SEMI | ?STATIC * Block | *Modifier * MemberDecl MemberDecl <- TypeParameters * GenericMethodOrConstructorRest | Type * Identifier * MethodDeclaratorRest | Type * VariableDeclarators * SEMI | VOID * Identifier * VoidMethodDeclaratorRest | Identifier * ConstructorDeclaratorRest | InterfaceDeclaration | ClassDeclaration | EnumDeclaration | AnnotationTypeDeclaration GenericMethodOrConstructorRest <- ( Type | VOID ) * Identifier * MethodDeclaratorRest | Identifier * ConstructorDeclaratorRest MethodDeclaratorRest <- FormalParameters * *Dim * ?( THROWS * ClassTypeList ) * ( MethodBody | SEMI ) VoidMethodDeclaratorRest <- FormalParameters * ?( THROWS * ClassTypeList ) * ( MethodBody | SEMI ) ConstructorDeclaratorRest <- FormalParameters * ?( THROWS * ClassTypeList ) * MethodBody MethodBody <- Block InterfaceDeclaration <- INTERFACE * Identifier * ?TypeParameters * ?( EXTENDS * ClassTypeList ) * InterfaceBody InterfaceBody <- LWING * *InterfaceBodyDeclaration * RWING InterfaceBodyDeclaration <- *Modifier * InterfaceMemberDecl | SEMI InterfaceMemberDecl <- InterfaceMethodOrFieldDecl | InterfaceGenericMethodDecl | VOID * Identifier * VoidInterfaceMethodDeclaratorRest | InterfaceDeclaration | AnnotationTypeDeclaration | ClassDeclaration | EnumDeclaration InterfaceMethodOrFieldDecl <- Type * Identifier * 
InterfaceMethodOrFieldRest InterfaceMethodOrFieldRest <- ConstantDeclaratorsRest * SEMI | InterfaceMethodDeclaratorRest InterfaceMethodDeclaratorRest <- FormalParameters * *Dim * ?( THROWS * ClassTypeList ) * SEMI InterfaceGenericMethodDecl <- TypeParameters * ( Type | VOID ) * Identifier * InterfaceMethodDeclaratorRest VoidInterfaceMethodDeclaratorRest <- FormalParameters * ?( THROWS * ClassTypeList ) * SEMI ConstantDeclaratorsRest <- ConstantDeclaratorRest * *( COMMA * ConstantDeclarator ) ConstantDeclarator <- Identifier * ConstantDeclaratorRest ConstantDeclaratorRest <- *Dim * EQU * VariableInitializer EnumDeclaration <- ENUM * Identifier * ?( IMPLEMENTS * ClassTypeList ) * EnumBody EnumBody <- LWING * ?EnumConstants * ?COMMA * ?EnumBodyDeclarations * RWING EnumConstants <- EnumConstant * *( COMMA * EnumConstant ) EnumConstant <- *Annotation * Identifier * ?Arguments * ?ClassBody EnumBodyDeclarations <- SEMI * *ClassBodyDeclaration LocalVariableDeclarationStatement <- *( FINAL | Annotation ) * Type * VariableDeclarators * SEMI VariableDeclarators <- VariableDeclarator * *( COMMA * VariableDeclarator ) VariableDeclarator <- Identifier * *Dim * ?( EQU * VariableInitializer ) FormalParameters <- LPAR * ?FormalParameterList * RPAR FormalParameter <- *( FINAL | Annotation ) * Type * VariableDeclaratorId LastFormalParameter <- *( FINAL | Annotation ) * Type * ELLIPSIS * VariableDeclaratorId FormalParameterList <- FormalParameter * *( COMMA * FormalParameter ) * ?( COMMA * LastFormalParameter ) | LastFormalParameter VariableDeclaratorId <- Identifier * *Dim Block <- LWING * BlockStatements * RWING BlockStatements <- *BlockStatement BlockStatement <- LocalVariableDeclarationStatement | *Modifier * ( ClassDeclaration | EnumDeclaration ) | Statement Statement <- Block | ASSERT * Expression * ?( COLON * Expression ) * SEMI | IF * ParExpression * Statement * ?( ELSE * Statement ) | FOR * LPAR * ?ForInit * SEMI * ?Expression * SEMI * ?ForUpdate * RPAR * Statement | FOR * 
LPAR * FormalParameter * COLON * Expression * RPAR * Statement | WHILE * ParExpression * Statement | DO * Statement * WHILE * ParExpression * SEMI | TRY * Block * ( +Catch * ?Finally | Finally ) | SWITCH * ParExpression * LWING * SwitchBlockStatementGroups * RWING | SYNCHRONIZED * ParExpression * Block | RETURN * ?Expression * SEMI | THROW * Expression * SEMI | BREAK * ?Identifier * SEMI | CONTINUE * ?Identifier * SEMI | SEMI | StatementExpression * SEMI | Identifier * COLON * Statement Catch <- CATCH * LPAR * FormalParameter * RPAR * Block Finally <- FINALLY * Block SwitchBlockStatementGroups <- *SwitchBlockStatementGroup SwitchBlockStatementGroup <- SwitchLabel * BlockStatements SwitchLabel <- CASE * ConstantExpression * COLON | CASE * EnumConstantName * COLON | DEFAULT * COLON ForInit <- *( FINAL | Annotation ) * Type * VariableDeclarators | StatementExpression * *( COMMA * StatementExpression ) ForUpdate <- StatementExpression * *( COMMA * StatementExpression ) EnumConstantName <- Identifier StatementExpression <- Expression ConstantExpression <- Expression Expression <- ConditionalExpression * *( AssignmentOperator * ConditionalExpression ) AssignmentOperator <- EQU | PLUSEQU | MINUSEQU | STAREQU | DIVEQU | ANDEQU | OREQU | HATEQU | MODEQU | SLEQU | SREQU | BSREQU ConditionalExpression <- ConditionalOrExpression * *( QUERY * Expression * COLON * ConditionalOrExpression ) ConditionalOrExpression <- ConditionalAndExpression * *( OROR * ConditionalAndExpression ) ConditionalAndExpression <- InclusiveOrExpression * *( ANDAND * InclusiveOrExpression ) InclusiveOrExpression <- ExclusiveOrExpression * *( OR * ExclusiveOrExpression ) ExclusiveOrExpression <- AndExpression * *( HAT * AndExpression ) AndExpression <- EqualityExpression * *( AND * EqualityExpression ) EqualityExpression <- RelationalExpression * *( ( EQUAL | NOTEQUAL ) * RelationalExpression ) RelationalExpression <- ShiftExpression * *( ( LE | GE | LT | GT ) * ShiftExpression | INSTANCEOF * 
ReferenceType ) ShiftExpression <- AdditiveExpression * *( ( SL | SR | BSR ) * AdditiveExpression ) AdditiveExpression <- MultiplicativeExpression * *( ( PLUS | MINUS ) * MultiplicativeExpression ) MultiplicativeExpression <- UnaryExpression * *( ( STAR | DIV | MOD ) * UnaryExpression ) UnaryExpression <- PrefixOp * UnaryExpression | LPAR * Type * RPAR * UnaryExpression | Primary * *( Selector ) * *( PostfixOp ) Primary <- ParExpression | NonWildcardTypeArguments * ( ExplicitGenericInvocationSuffix | THIS * Arguments ) | THIS * ?Arguments | SUPER * SuperSuffix | Literal | NEW * Creator | QualifiedIdentifier * ?IdentifierSuffix | BasicType * *Dim * DOT * CLASS | VOID * DOT * CLASS IdentifierSuffix <- LBRK * ( RBRK * *Dim * DOT * CLASS | Expression * RBRK ) | Arguments | DOT * ( CLASS | ExplicitGenericInvocation | THIS | SUPER * Arguments | NEW * ?NonWildcardTypeArguments * InnerCreator ) ExplicitGenericInvocation <- NonWildcardTypeArguments * ExplicitGenericInvocationSuffix NonWildcardTypeArguments <- LPOINT * ReferenceType * *( COMMA * ReferenceType ) * RPOINT ExplicitGenericInvocationSuffix <- SUPER * SuperSuffix | Identifier * Arguments PrefixOp <- INC | DEC | BANG | TILDA | PLUS | MINUS PostfixOp <- INC | DEC Selector <- DOT * Identifier * ?Arguments | DOT * ExplicitGenericInvocation | DOT * THIS | DOT * SUPER * SuperSuffix | DOT * NEW * ?NonWildcardTypeArguments * InnerCreator | DimExpr SuperSuffix <- Arguments | DOT * ?NonWildcardTypeArguments * Identifier * ?Arguments BasicType <- ( "byte" | "short" | "char" | "int" | "long" | "float" | "double" | "boolean" ) * !LetterOrDigit * Spacing Arguments <- LPAR * ?( Expression * *( COMMA * Expression ) ) * RPAR Creator <- ?NonWildcardTypeArguments * CreatedName * ClassCreatorRest | ?NonWildcardTypeArguments * ( ClassType | BasicType ) * ArrayCreatorRest CreatedName <- Identifier * ?NonWildcardTypeArguments * *( DOT * Identifier * ?NonWildcardTypeArguments ) InnerCreator <- Identifier * ClassCreatorRest 
ArrayCreatorRest <- LBRK * ( RBRK * *Dim * ArrayInitializer | Expression * RBRK * *DimExpr * *Dim ) ClassCreatorRest <- Arguments * ?ClassBody ArrayInitializer <- LWING * ?( VariableInitializer * *( COMMA * VariableInitializer ) ) * ?COMMA * RWING VariableInitializer <- ArrayInitializer | Expression ParExpression <- LPAR * Expression * RPAR QualifiedIdentifier <- Identifier * *( DOT * Identifier ) Dim <- LBRK * RBRK DimExpr <- LBRK * Expression * RBRK Type <- ( BasicType | ClassType ) * *Dim ReferenceType <- BasicType * +Dim | ClassType * *Dim ClassType <- Identifier * ?TypeArguments * *( DOT * Identifier * ?TypeArguments ) ClassTypeList <- ClassType * *( COMMA * ClassType ) TypeArguments <- LPOINT * TypeArgument * *( COMMA * TypeArgument ) * RPOINT TypeArgument <- ReferenceType | QUERY * ?( ( EXTENDS | SUPER ) * ReferenceType ) TypeParameters <- LPOINT * TypeParameter * *( COMMA * TypeParameter ) * RPOINT TypeParameter <- Identifier * ?( EXTENDS * Bound ) Bound <- ClassType * *( AND * ClassType ) Modifier <- Annotation | ( "public" | "protected" | "private" | "static" | "abstract" | "final" | "native" | "synchronized" | "transient" | "volatile" | "strictfp" ) * !LetterOrDigit * Spacing AnnotationTypeDeclaration <- AT * INTERFACE * Identifier * AnnotationTypeBody AnnotationTypeBody <- LWING * *AnnotationTypeElementDeclaration * RWING AnnotationTypeElementDeclaration <- *Modifier * AnnotationTypeElementRest | SEMI AnnotationTypeElementRest <- Type * AnnotationMethodOrConstantRest * SEMI | ClassDeclaration | EnumDeclaration | InterfaceDeclaration | AnnotationTypeDeclaration AnnotationMethodOrConstantRest <- AnnotationMethodRest | AnnotationConstantRest AnnotationMethodRest <- Identifier * LPAR * RPAR * ?DefaultValue AnnotationConstantRest <- VariableDeclarators DefaultValue <- DEFAULT * ElementValue Annotation <- NormalAnnotation | SingleElementAnnotation | MarkerAnnotation NormalAnnotation <- AT * QualifiedIdentifier * LPAR * ?ElementValuePairs * RPAR 
SingleElementAnnotation <- AT * QualifiedIdentifier * LPAR * ElementValue * RPAR MarkerAnnotation <- AT * QualifiedIdentifier ElementValuePairs <- ElementValuePair * *( COMMA * ElementValuePair ) ElementValuePair <- Identifier * EQU * ElementValue ElementValue <- ConditionalExpression | Annotation | ElementValueArrayInitializer ElementValueArrayInitializer <- LWING * ?ElementValues * ?COMMA * RWING ElementValues <- ElementValue * *( COMMA * ElementValue ) Spacing <- *( +{' ','\t','\r','\n','\x0c'} | "/*" * *( !"*/" * 1 ) * "*/" | "//" * *( !{'\r','\n'} * 1 ) * {'\r','\n'} ) Identifier <- !Keyword * Letter * *LetterOrDigit * Spacing Letter <- {'a'..'z'} | {'A'..'Z'} | {'_','$'} LetterOrDigit <- {'a'..'z'} | {'A'..'Z'} | {'0'..'9'} | {'_','$'} Keyword <- ( "abstract" | "assert" | "boolean" | "break" | "byte" | "case" | "catch" | "char" | "class" | "const" | "continue" | "default" | "double" | "do" | "else" | "enum" | "extends" | "false" | "finally" | "final" | "float" | "for" | "goto" | "if" | "implements" | "import" | "interface" | "int" | "instanceof" | "long" | "native" | "new" | "null" | "package" | "private" | "protected" | "public" | "return" | "short" | "static" | "strictfp" | "super" | "switch" | "synchronized" | "this" | "throws" | "throw" | "transient" | "true" | "try" | "void" | "volatile" | "while" ) * !LetterOrDigit ASSERT <- "assert" * !LetterOrDigit * Spacing BREAK <- "break" * !LetterOrDigit * Spacing CASE <- "case" * !LetterOrDigit * Spacing CATCH <- "catch" * !LetterOrDigit * Spacing CLASS <- "class" * !LetterOrDigit * Spacing CONTINUE <- "continue" * !LetterOrDigit * Spacing DEFAULT <- "default" * !LetterOrDigit * Spacing DO <- "do" * !LetterOrDigit * Spacing ELSE <- "else" * !LetterOrDigit * Spacing ENUM <- "enum" * !LetterOrDigit * Spacing EXTENDS <- "extends" * !LetterOrDigit * Spacing FINALLY <- "finally" * !LetterOrDigit * Spacing FINAL <- "final" * !LetterOrDigit * Spacing FOR <- "for" * !LetterOrDigit * Spacing IF <- "if" * !LetterOrDigit * 
Spacing
  IMPLEMENTS <- "implements" * !LetterOrDigit * Spacing
  IMPORT <- "import" * !LetterOrDigit * Spacing
  INTERFACE <- "interface" * !LetterOrDigit * Spacing
  INSTANCEOF <- "instanceof" * !LetterOrDigit * Spacing
  NEW <- "new" * !LetterOrDigit * Spacing
  PACKAGE <- "package" * !LetterOrDigit * Spacing
  RETURN <- "return" * !LetterOrDigit * Spacing
  STATIC <- "static" * !LetterOrDigit * Spacing
  SUPER <- "super" * !LetterOrDigit * Spacing
  SWITCH <- "switch" * !LetterOrDigit * Spacing
  SYNCHRONIZED <- "synchronized" * !LetterOrDigit * Spacing
  THIS <- "this" * !LetterOrDigit * Spacing
  THROWS <- "throws" * !LetterOrDigit * Spacing
  THROW <- "throw" * !LetterOrDigit * Spacing
  TRY <- "try" * !LetterOrDigit * Spacing
  VOID <- "void" * !LetterOrDigit * Spacing
  WHILE <- "while" * !LetterOrDigit * Spacing
  # Literals: every literal consumes the spacing that follows it
  Literal <- ( FloatLiteral | IntegerLiteral | CharLiteral | StringLiteral | "true" * !LetterOrDigit | "false" * !LetterOrDigit | "null" * !LetterOrDigit ) * Spacing
  IntegerLiteral <- ( HexNumeral | OctalNumeral | DecimalNumeral ) * ?{'l','L'}
  DecimalNumeral <- "0" | {'1'..'9'} * *{'0'..'9'}
  HexNumeral <- ( "0x" | "0X" ) * +HexDigit
  HexDigit <- {'a'..'f'} | {'A'..'F'} | {'0'..'9'}
  OctalNumeral <- "0" * +{'0'..'7'}
  FloatLiteral <- HexFloat | DecimalFloat
  DecimalFloat <- +Digit * "." * *Digit * ?Exponent * ?{'f','F','d','D'} | "." * +Digit * ?Exponent * ?{'f','F','d','D'} | +Digit * Exponent * ?{'f','F','d','D'} | +Digit * ?Exponent * {'f','F','d','D'}
  # The optional sign of an exponent is '+' or '-' only. The set used to
  # contain a stray '\\' as well, which made input such as `1e\5` parse as a
  # float literal. NOTE(review): the backslash looks like an artifact of
  # converting an escaped hyphen (`\-`) in the Mouse grammar - confirm against
  # the Mouse source.
  Exponent <- {'e','E'} * ?{'+','-'} * +Digit
  Digit <- {'0'..'9'}
  HexFloat <- HexSignificand * BinaryExponent * ?{'f','F','d','D'}
  HexSignificand <- ( "0x" | "0X" ) * *HexDigit * "." * +HexDigit | HexNumeral * ?"."
# Rules of the `peg CompilationUnit` grammar: literals, escapes and operators.
# The optional sign of a binary exponent is '+' or '-' only; the set used to
# contain a stray '\\' as well. NOTE(review): the backslash looks like an
# artifact of converting an escaped hyphen (`\-`) in the Mouse grammar -
# confirm against the Mouse source.
  BinaryExponent <- {'p','P'} * ?{'+','-'} * +Digit
  CharLiteral <- "\'" * ( Escape | !{'\'','\\','\n','\r'} * 1 ) * "\'"
  StringLiteral <- "\"" * *( Escape | !{'"','\\','\n','\r'} * 1 ) * "\""
  Escape <- "\\" * ( {'b','t','n','f','r','"','\'','\\'} | OctalEscape | UnicodeEscape )
  OctalEscape <- {'0'..'3'} * {'0'..'7'} * {'0'..'7'} | {'0'..'7'} * {'0'..'7'} | {'0'..'7'}
  UnicodeEscape <- +"u" * HexDigit * HexDigit * HexDigit * HexDigit
  # Operators and separators. A negative look-ahead keeps a short operator from
  # matching the prefix of a longer one (e.g. AND must not match "&&" or "&=").
  AT <- "@" * Spacing
  AND <- "&" * !{'=','&'} * Spacing
  ANDAND <- "&&" * Spacing
  ANDEQU <- "&=" * Spacing
  BANG <- "!" * !"=" * Spacing
  BSR <- ">>>" * !"=" * Spacing
  BSREQU <- ">>>=" * Spacing
  COLON <- ":" * Spacing
  COMMA <- "," * Spacing
  DEC <- "--" * Spacing
  DIV <- "/" * !"=" * Spacing
  DIVEQU <- "/=" * Spacing
  DOT <- "." * Spacing
  ELLIPSIS <- "..." * Spacing
  EQU <- "=" * !"=" * Spacing
  EQUAL <- "==" * Spacing
  GE <- ">=" * Spacing
  GT <- ">" * !{'=','>'} * Spacing
  HAT <- "^" * !"=" * Spacing
  HATEQU <- "^=" * Spacing
  INC <- "++" * Spacing
  LBRK <- "[" * Spacing
  LE <- "<=" * Spacing
  LPAR <- "(" * Spacing
  LPOINT <- "<" * Spacing
  LT <- "<" * !{'=','<'} * Spacing
  LWING <- "{" * Spacing
  # MINUS must not match the prefix of "-=" or "--"; the look-ahead set used to
  # contain a stray '\\' for the same reason as the exponent sign sets.
  MINUS <- "-" * !{'=','-'} * Spacing
  MINUSEQU <- "-=" * Spacing
  MOD <- "%" * !"=" * Spacing
  MODEQU <- "%=" * Spacing
  NOTEQUAL <- "!=" * Spacing
  OR <- "|" * !{'=','|'} * Spacing
  OREQU <- "|=" * Spacing
  OROR <- "||" * Spacing
  PLUS <- "+" * !{'=','+'} * Spacing
  PLUSEQU <- "+=" * Spacing
  QUERY <- "?"
* Spacing
  RBRK <- "]" * Spacing
  RPAR <- ")" * Spacing
  RPOINT <- ">" * Spacing
  RWING <- "}" * Spacing
  SEMI <- ";" * Spacing
  SL <- "<<" * !"=" * Spacing
  SLEQU <- "<<=" * Spacing
  SR <- ">>" * !{'=','>'} * Spacing
  SREQU <- ">>=" * Spacing
  STAR <- "*" * !"=" * Spacing
  STAREQU <- "*=" * Spacing
  TILDA <- "~" * Spacing
  EOT <- !1

================================================
FILE: misc/mouse2npeg.nim
================================================
#
# Convert a Mouse PEG grammar into NPeg grammar
# http://www.romanredz.se/Mouse/
#

import npeg
import npeg/common
import strutils

# Parse the Mouse grammar into an ASTNode tree. Every A("id", ...) capture
# becomes one AST node with that id; dump() below dispatches on these ids.

let mouse = peg "mouse":
  mouse <- A("mouse", *rule) * ?s * !1
  rule <- ?s * A("rule", >name * s * "=" * s * patt)
  patt <- A("patt", choice * ?sem * s * ';')
  sem <- ('{' * @'}')
  choice <- A("choice", seq * s * *('/' * s * seq))
  seq <- A("seq", prefixed * *(s * prefixed) * s)
  nonterm <- A("nonterm", >name)
  prefixed <- A("pre", ?>'!' * postfixed)
  postfixed <- A("post", (paren | nonterm | lit) * >?postfix)
  lit <- any | range | set | string
  any <- A("any", '_')
  range <- A("range", '[' * >(char * '-' * char) * ']')
  set <- A("set", '[' * +(char-']') * ']')
  string <- A("string", '"' * +(char-'"') * '"')
  paren <- A("paren", '(' * s * choice * s * ')')
  postfix <- {'+','*','?'}
  name <- +Alpha
  # One character: a \uXXXX escape, a backslash escape, or any single
  # character. '-', '[' and ']' are accepted as escaped characters so that a
  # class such as [+\-] yields the two characters '+' and '-' instead of
  # '+', '\' and '-'.
  # NOTE(review): assumes Mouse treats `\x` as the literal character x for
  # these three characters - confirm against the Mouse manual.
  char <- A("char", >( ("\\u" * Xdigit[4]) | ('\\' * {'\\','r','n','t','"','-','[',']'}) | 1))
  nl <- {'\r','\n'}
  s <- *( +Space | comment | sem )
  comment <- "//" * >*(1-nl)

# Dump the PEG ast tree into NPeg form

proc dump(a: ASTNode): string =

  # Translate the text of one captured "char" node into the text to place
  # inside a Nim character or string literal.
  proc unescapeChar(s: string): string =
    if s == "'":
      # a bare quote must be escaped inside a Nim char literal
      result = "\\'"
    elif s == "\\":
      # a lone backslash must be escaped as well
      result = "\\\\"
    elif s.len == 6:
      # \uXXXX: convert the four hex digits to a char and escape it
      result = $(parseHexInt(s[2..5]).char.escapeChar)
    elif s.len == 2 and s[0] == '\\' and s[1] in {'-', '[', ']'}:
      # Mouse-only escapes: these characters need no escaping in Nim
      result = $s[1]
    else:
      # plain characters and escapes that Nim spells the same way (\n, \t, ..)
      result = s

  case a.id:
    of "mouse":
      for c in a:
        result.add dump(c)
    of "rule":
      return " " & $a.val & " <- " & dump(a["patt"]) & "\n"
    of "patt":
      return dump a[0]
    of "choice":
      var parts: seq[string]
      for c in a:
        parts.add dump(c)
      return parts.join(" | ")
    of "seq":
      var parts: seq[string]
for c in a: parts.add dump(c) return parts.join(" * ") of "paren": return "( " & dump(a[0]) & " )" of "pre": return a.val & dump(a[0]) of "post": return a.val & dump(a[0]) of "nonterm": return a.val of "any": return "1" of "string": result.add '"' for c in a: result.add unescapeChar(c.val) result.add '"' of "set": var cs: seq[string] for c in a: cs.add unescapeChar(c.val) return "{'" & cs.join("','") & "'}" of "range": return "{'" & escapeChar(a.val[0]) & "'..'" & escapeChar(a.val[2]) & "'}" else: echo "\nUnhnandled " & a.id quit 1 # http://www.romanredz.se/Mouse/Java.1.6.peg let r = mouse.matchFile("/tmp/Java.1.6.peg") if not r.ok: echo "Error parsing at ", r.matchMax quit 1 echo "import npeg" echo "let r = peg CompilationUnit:" echo dump(r.capturesAst()) ================================================ FILE: misc/rod.nim ================================================ import npeg import strutils # Rod AST node types type NodeKind* = enum nkEmpty nkScript, nkBlock nkBool, nkNumber, nkString, nkIdent nkPrefix, nkInfix, nkDot, nkIndex nkVar, nkLet nkIf, nkWhile, nkFor nkBreak, nkContinue nkCall nkGeneric nkObject, nkObjFields, nkObjConstr Node* = ref object ln*, col*: int file*: string case kind*: NodeKind of nkEmpty: discard of nkBool: boolVal*: bool of nkNumber: numberVal*: float of nkString: stringVal*: string of nkIdent: ident*: string else: children*: seq[Node] type ParseStack = seq[Node] # Pretty printing proc `$`*(node: Node, showLineInfo = false): string = const LeafNodes = { nkEmpty, nkBool, nkNumber, nkString, nkIdent, nkPrefix, nkInfix } case node.kind of nkEmpty: result = "" of nkBool: result = $node.boolVal of nkNumber: result = $node.numberVal of nkString: result = escape(node.stringVal) of nkIdent: result = node.ident else: result = (if showLineInfo: $node.ln & ":" & $node.col & " " else: "") & "(" & (case node.kind of nkPrefix, nkInfix: "" else: $node.kind & " ") for i, child in node.children: if child.kind notin LeafNodes and node.children.len > 1: 
result.add("\n") result.add(indent(`$`(child, showLineInfo), 2)) else: if i > 0: result.add(" ") result.add(`$`(child, showLineInfo)) result.add(")") proc `$`*(ps: ParseStack): string = for i, n in ps: result &= $i & ":\n" & $n & "\n" result &= "\n" proc addToParent(ps: var ParseStack, ns: varargs[Node]) = ps[ps.high].children.add ns proc swap(ps: var ParseStack) = ps.add ps[ps.high-1] ps.delete ps.high-2 let p = peg(rod, ps: ParseStack): S <- *Space # Basic tokens tokColon <- ":" * S tokEquals <- "=" * S tokComma <- "," * S tokPlus <- "+" * S tokMinus <- "-" * S tokMul <- "*" * S tokDiv <- "/" * S tokParOpen <- "(" * S tokParClose <- ")" * S tokCurOpen <- "{" * S tokCurClose <- "}" * S tokVar <- "var" * S tokLet <- "let" * S tokIf <- "if" * S tokElif <- "elif" * S tokElse <- "else" * S tokWhile <- "while" * S tokObject <- "object" * S keyWords <- "var" | "let" | "if" | "elif" | "else" | "while" | "object" # Atoms tokNumber <- >+Digit * S: ps.add Node(kind: nkNumber, numberVal: parseFloat($1)) tokType <- Alpha * *Alnum * S tokBool <- >("true" | "false") * S: ps.add Node(kind: nkBool, boolval: $1 == "true") tokIdent <- >((Alpha * *Alnum) - keyWords) * S: ps.add Node(kind: nkIdent, ident: $1) # Block blockOpen <- tokCurOpen: ps.add Node(kind: nkBlock) blockStmt <- stmt: ps.addToParent ps.pop() blockSec <- blockOpen * *blockStmt * tokCurClose # Var section varOpen <- (tokVar | tokLet): ps.add Node(kind: nkVar) varDef <- tokIdent * ?(tokColon * tokType) * ?(tokEquals * exprSec): ps.swap() ps.addToParent Node(kind: nkVar, children: @[Node(kind: nkIdent, ident: "="), ps.pop(), ps.pop()]) varSec <- varOpen * +varDef * *(tokComma * varDef): ps.add ps.pop() # While statement whileSec <- tokWhile * exprSec * blockSec: ps.swap() ps.add Node(kind: nkWhile, children: @[ps.pop(), ps.pop()]) # If expressions ifOpen <- tokIf * exprSec * blockSec: let (nBlock, nExpr) = (ps.pop(), ps.pop()) ps.add Node(kind: nkIf, children: @[nExpr, nBlock]) ifElif <- (tokElif * exprSec * blockSec): 
ps.swap() ps.addtoParent ps.pop(), ps.pop() ifElse <- ?(tokElse * blockSec): ps.addToParent ps.pop() ifExpr <- ifOpen * *ifElif * ?ifElse # Object objectSec <- tokObject * tokIdent * tokCurOpen * objFields * tokCurClose objFields <- tokIdent * *(tokComma * tokIdent) * tokColon * tokType stmt <- blockSec | varSec | objectSec | whileSec | exprSec rod <- S * +stmt * !1 # Expressions: Pratt parser exprSec <- exp exp <- S * prefix * *infix prefix <- ifExpr | tokBool | tokNumber | parenExp | uniMinus | tokIdent uniMinus <- >'-' * exp parenExp <- ( tokParOpen * exp * tokParClose ) ^ 0 infix <- >("not" | "->" | "$") * exp ^ 1 | >("=") * exp ^ 2 | >("or" | "xor") * exp ^ 3 | >("and") * exp ^ 4 | >("==" | "<=" | "<" | ">=" | ">" | "!=" | "in" | "notin" | "is" | "isnot" | "of") * exp ^ 5 | >(".." | "..<") * exp ^ 6 | >("&") * exp ^ 7 | >("+" | "-") * exp ^ 8 | >("*" | "/" | "%") * exp ^ 9 | >("div" | "mod" | "shl" | "shr") * exp ^ 10 | >("^") * exp ^^ 11: let (f2, f1) = (ps.pop(), ps.pop()) ps.add Node(kind: nkInfix, children: @[Node(kind: nkIdent, ident: $1), f1, f2]) proc compile(source:string) = var ps: ParseStack echo "---------------" echo source if p.match(source, ps).ok: echo "---------------" let n = Node(kind: nkBlock, children: ps) echo n when false: compile """ if a > 3 { var w = 42 } """ when false: compile(""" var a = 2 + 2, b = 2 + a """) when true: compile(""" { var a = 10 { var a = a } } { var a = 12 a = a + 3 } """) when false: compile(""" let x = true if x { var x = 2 } """) when false: compile(""" let x = true if x { var x = 2 } elif false { var y = 3 } elif false { var z = 4 } else { var w = 5 } """) when false: compile(""" let x = if true { 2 } else { 4 } """) when false: compile(""" let x = true while x { let y = 1 } """) when false: compile(""" while true { let y = 1 } """) when false: compile(""" while false { let y = 1 } """) when false: compile(""" var x = 0, stop = false while x { } """) ================================================ FILE: 
npeg.nimble ================================================ # Package version = "1.3.0" author = "Ico Doornekamp" description = "a PEG library" license = "MIT" srcDir = "src" installExt = @["nim"] # Dependencies requires "nim >= 0.19.0" # Test task test, "Runs the test suite": exec "nimble testc && nimble testcpp && nimble testarc && nimble testjs" task testc, "C tests": exec "nim c -r tests/tests.nim" task testcpp, "CPP tests": exec "nim cpp -r tests/tests.nim" task testjs, "JS tests": exec "nim js -r tests/tests.nim" task testdanger, "Runs the test suite in danger mode": exec "nim c -d:danger -r tests/tests.nim" task testwin, "Mingw tests": exec "nim c -d:mingw tests/tests.nim && wine tests/tests.exe" task test32, "32 bit tests": exec "nim c --cpu:i386 --passC:-m32 --passL:-m32 tests/tests.nim && tests/tests" task testall, "Test all": exec "nimble test && nimble testcpp && nimble testdanger && nimble testjs && nimble testwin" when (NimMajor, NimMinor) >= (1, 1): task testarc, "--gc:arc tests": exec "nim c --gc:arc -r tests/tests.nim" else: task testarc, "--gc:arc tests": exec "true" task perf, "Test performance": exec "nim cpp -r -d:danger tests/performance.nim" ================================================ FILE: src/npeg/capture.nim ================================================ import strutils import sequtils import npeg/[stack,common] type Capture*[S] = object ck: CapKind si*: int name: string len: int when S is char: s*: string else: s*: S Captures*[S] = object capList*: seq[Capture[S]] FixMethod* = enum FixAll, FixOpen # Search the capStack for cftOpen matching the cftClose on top proc findTop[S](capStack: var Stack[CapFrame[S]], fm: FixMethod): int = if fm == FixOpen: var i = capStack.top - 1 var depth = 0 while true: if capStack[i].cft == cftClose: inc depth else: dec depth if depth == 0: break dec i result = i # Convert all closed CapFrames on the capture stack to a list of Captures, all # consumed frames are removed from the CapStack proc 
fixCaptures*[S](s: openArray[S], capStack: var Stack[CapFrame[S]], fm: FixMethod): Captures[S] = assert capStack.top > 0 assert capStack.peek.cft == cftClose when npegDebug: echo $capStack # Convert the closed frames to a seq[Capture] var stack = initStack[int]("captures", 8) let iFrom = findTop(capStack, fm) for i in iFrom..= cs.capList.len: let msg = "Capture out of range, " & $i & " is not in [0.." & $cs.capList.high & "]" raise newException(NPegCaptureOutOfRangeError, msg) cs.capList[i] proc `[]`*[S](cs: Captures[S], i: int): Capture[S] = cs.getCapture(i) proc `[]`*[S](cs: Captures[S], i: BackwardsIndex): Capture[S] = cs.getCapture(cs.capList.len-i.int) proc `[]`*[S](cs: Captures[S], range: HSlice[system.int, system.int]): seq[Capture[S]] = for i in range: result.add cs.getCapture(i) iterator items*[S](captures: Captures[S]): Capture[S] = for c in captures.capList: yield c proc len*[S](captures: Captures[S]): int = captures.capList.len ================================================ FILE: src/npeg/codegen.nim ================================================ import macros except quote, stamp import strutils import tables import npeg/[common,patt,stack,capture] type RetFrame = int BackFrame = object ip*: int # Instruction pointer si*: int # Subject index rp*: int # Retstack top pointer cp*: int # Capstack top pointer pp*: int # PrecStack top pointer PrecFrame = int MatchResult*[S] = object ok*: bool matchLen*: int matchMax*: int cs*: Captures[S] MatchState*[S] = object ip*: int si*: int simax*: int refs*: Table[string, string] retStack*: Stack[RetFrame] capStack*: Stack[CapFrame[S]] backStack*: Stack[BackFrame] precStack*: Stack[PrecFrame] Parser*[S, T] = object fn_init*: proc(): MatchState[S] when npegGcsafe: fn_run*: proc(ms: var MatchState[S], s: openArray[S], u: var T): MatchResult[S] {.gcsafe.} else: fn_run*: proc(ms: var MatchState[S], s: openArray[S], u: var T): MatchResult[S] when declared(macros.stamp): # nimskull template quote(body: untyped): NimNode 
= macros.stamp(body)
else:
  template quote(body: untyped): NimNode =
    macros.quote(body)

# Rewrite the capture shorthand used inside code block captures: `$N` becomes
# `capture[N].s` and `@N` becomes `capture[N].si`. Any other node is copied and
# its children are rewritten recursively. Newly generated nodes get the line
# info of the node they replace, to make sure "Capture out of range"
# exceptions are properly traced.

proc doSugar(n, captureId: NimNode): NimNode =

  # Stamp the line info of `n` on `node` and on everything below it
  proc stampLineInfo(node: NimNode) =
    node.copyLineInfo(n)
    for child in node:
      stampLineInfo(child)

  # Name of the capture field selected by the sigil; stays empty when `n` is
  # not a `$`/`@` prefix applied to an integer literal
  var field = ""
  if n.kind == nnkPrefix and n[0].kind == nnkIdent and n[1].kind == nnkIntLit:
    if n[0].eqIdent("$"):
      field = "s"
    elif n[0].eqIdent("@"):
      field = "si"

  if field.len == 0:
    # Not sugar: shallow copy of the node, then recurse into the children
    result = copyNimNode(n)
    for child in n:
      result.add doSugar(child, captureId)
  else:
    result = newDotExpr(nnkBracketExpr.newTree(captureId, n[1]), ident(field))
    stampLineInfo(result)

# Generate the parser main loop. The .computedGoto. pragma will generate code
# using C computed gotos, which will get highly optimized, mostly eliminating
# the inner parser loop.
# Nim limits computed goto to a maximum of 10_000 cases; if our program is
# this large, emit a warning and do not use a computed goto.
#
# The result is a `while true:` statement whose body is the optional
# .computedGoto. pragma followed by `casesCode`.

proc genLoopCode(program: Program, casesCode: NimNode): NimNode=
  let loopBody = nnkStmtList.newTree()
  if program.patt.len < 10_000:
    loopBody.add nnkPragma.newTree(ident "computedGoto")
  else:
    warning "Grammar too large for computed goto, falling back to normal 'case'"
  loopBody.add casesCode
  result = nnkWhileStmt.newTree(newLit(true), loopBody)


# Generate out all the case handlers for the parser program: one `of` branch
# per instruction, keyed on the instruction pointer.
#
# - program: the linked program whose instructions are translated
# - sType, uType, uId: subject type, userdata type and userdata identifier
# - ms, s, si, simax, ip: identifiers of the match state, the subject, the
#   subject index, the highest subject index seen and the instruction pointer
#   as they are named in the generated code

proc genCasesCode*(program: Program, sType, uType, uId: NimNode, ms, s, si, simax, ip: NimNode): NimNode =

  result = quote:
    case `ip`

  for ipNow, i in program.patt.pairs:

    let
      ipNext = ipNow + 1
      opName = newLit(repeat(" ", i.indent) & ($i.op).toLowerAscii[2..^1])
      iname = newLit(i.name)
      # Without an explicit fail offset a failing instruction jumps to the
      # last instruction of the program
      ipFail =
        if i.failOffset == 0: program.patt.high
        else: ipNow + i.failOffset

    var call = case i.op:

      of opChr:
        let ch = newLit(i.ch)
        quote:
          trace `ms`, `iname`, `opName`, `s`, "\"" & escapeChar(`ch`) & "\""
          if `si` < `s`.len and `s`[`si`] == `ch`.char:
            inc `si`
            `ip` = `ipNext`
          else:
            `ip` = `ipFail`

      of opLit:
        let lit = i.lit
        quote:
          trace `ms`, `iname`, `opName`, `s`, `lit`.repr
          if `si` < `s`.len and `s`[`si`] == `lit`:
            inc `si`
            `ip` = `ipNext`
          else:
            `ip` = `ipFail`

      of opSet:
        let cs = newLit(i.cs)
        quote:
          trace `ms`, `iname`, `opName`, `s`, dumpSet(`cs`)
          if `si` < `s`.len and `s`[`si`] in `cs`:
            inc `si`
            `ip` = `ipNext`
          else:
            `ip` = `ipFail`

      of opSpan:
        let cs = newLit(i.cs)
        quote:
          trace `ms`, `iname`, `opName`, `s`, dumpSet(`cs`)
          while `si` < `s`.len and `s`[`si`] in `cs`:
            inc `si`
          `ip` = `ipNext`

      of opChoice:
        let ip2 = newLit(ipNow + i.ipOffset)
        let siOffset = newLit(i.siOffset)
        quote:
          trace `ms`, `iname`, `opName`, `s`, $`ip2`
          push(`ms`.backStack, BackFrame(ip:`ip2`, si:`si`+`siOffset`, rp:`ms`.retStack.top, cp:`ms`.capStack.top, pp:`ms`.precStack.top))
          `ip` = `ipNext`

      of opCommit:
        let ip2 = newLit(ipNow + i.ipOffset)
        quote:
          trace `ms`, `iname`, `opName`, `s`, $`ip2`
          discard pop(`ms`.backStack)
          `ip` = `ip2`

      of opCall:
        let label = newLit(i.callLabel)
        let ip2 = newLit(ipNow + i.callOffset)
        quote:
          trace `ms`, `iname`, `opName`, `s`, `label` & ":" & $`ip2`
          push(`ms`.retStack, `ipNext`)
          `ip` = `ip2`

      of opJump:
        let label = newLit(i.callLabel)
        let ip2 = newLit(ipNow + i.callOffset)
        quote:
          trace `ms`, `iname`, `opName`, `s`, `label` & ":" & $`ip2`
          `ip` = `ip2`

      of opCapOpen:
        let capKind = newLit(i.capKind)
        let capName = newLit(i.capName)
        let capSiOffset = newLit(i.capSiOffset)
        quote:
          trace `ms`, `iname`, `opName`, `s`, $`capKind` & " -> " & $`si`
          push(`ms`.capStack, CapFrame[`sType`](cft: cftOpen, si: `si`+`capSiOffset`, ck: `capKind`, name: `capName`))
          `ip` = `ipNext`

      of opCapClose:
        let ck = newLit(i.capKind)

        case i.capKind:

          of ckCodeBlock:
            # The user code block is wrapped in a local proc that returns
            # true unless the block itself returns something else
            let captureId = ident "capture"
            let code = doSugar(i.capAction, captureId)
            quote:
              trace `ms`, `iname`, `opName`, `s`, "ckCodeBlock -> " & $`si`
              push(`ms`.capStack, CapFrame[`sType`](cft: cftClose, si: `si`, ck: `ck`))
              let capture = collectCaptures(fixCaptures[`sType`](`s`, `ms`.capStack, FixOpen))
              proc fn(`captureId`: Captures[`sType`], `ms`: var MatchState[`sType`], `uId`: var `uType`): bool =
                result = true
                `code`
              if fn(capture, `ms`, `uId`):
                `ip` = `ipNext`
              else:
                `ip` = `ipFail`

          of ckRef:
            quote:
              trace `ms`, `iname`, `opName`, `s`, "ckRef -> " & $`si`
              push(`ms`.capStack, CapFrame[`sType`](cft: cftClose, si: `si`, ck: `ck`))
              let r = collectCapturesRef(fixCaptures[`sType`](`s`, `ms`.capStack, FixOpen))
              `ms`.refs[r.key] = r.val
              `ip` = `ipNext`

          else:
            quote:
              trace `ms`, `iname`, `opName`, `s`, $`ck` & " -> " & $`si`
              push(`ms`.capStack, CapFrame[`sType`](cft: cftClose, si: `si`, ck: `ck`))
              `ip` = `ipNext`

      of opBackref:
        let refName = newLit(i.refName)
        quote:
          if `refName` in `ms`.refs:
            let s2 = `ms`.refs[`refName`]
            trace `ms`, `iname`, `opName`, `s`, `refName` & ":\"" & s2 & "\""
            if subStrCmp(`s`, `s`.len, `si`, s2):
              inc `si`, s2.len
              `ip` = `ipNext`
            else:
              `ip` = `ipFail`
          else:
            raise newException(NPegUnknownBackrefError, "Unknown back reference '" & `refName` & "'")

      of opErr:
        let msg = newLit(i.msg)
        quote:
          trace `ms`, `iname`, `opName`, `s`, `msg`
          var e = newException(NPegParseError, `msg`)
          `simax` = max(`simax`, `si`)
          raise e

      of opReturn:
        quote:
          trace `ms`, `iname`, `opName`, `s`
          if `ms`.retStack.top > 0:
            `ip` = pop(`ms`.retStack)
          else:
            # Returning with an empty return stack ends the match
            result.ok = true
            `simax` = max(`simax`, `si`)
            break

      of opAny:
        quote:
          trace `ms`, `iname`, `opName`, `s`
          if `si` < `s`.len:
            inc `si`
            `ip` = `ipNext`
          else:
            `ip` = `ipFail`

      of opNop:
        quote:
          trace `ms`, `iname`, `opName`, `s`
          `ip` = `ipNext`

      of opPrecPush:
        if i.prec == 0:
          quote:
            push(`ms`.precStack, 0)
            `ip` = `ipNext`
        else:
          let (iPrec, iAssoc) = (i.prec.newLit, i.assoc.newLit)
          let exp = if i.assoc == assocLeft:
            quote: peek(`ms`.precStack) < `iPrec`
          else:
            quote: peek(`ms`.precStack) <= `iPrec`
          quote:
            if `exp`:
              push(`ms`.precStack, `iPrec`)
              `ip` = `ipNext`
            else:
              `ip` = `ipFail`

      of opPrecPop:
        quote:
          discard `ms`.precStack.pop()
          `ip` = `ipNext`

      of opFail:
        quote:
          `simax` = max(`simax`, `si`)
          if `ms`.backStack.top > 0:
            trace `ms`, "", "opFail", `s`, "(backtrack)"
            let t = pop(`ms`.backStack)
            (`ip`, `si`, `ms`.retStack.top, `ms`.capStack.top, `ms`.precStack.top) = (t.ip, t.si, t.rp, t.cp, t.pp)
          else:
            trace `ms`, "", "opFail", `s`, "(error)"
            break

    # Recursively copy the line info from the original instruction NimNode into
    # the generated Nim code
    proc stampLineInfo(tree: NimNode) =
      tree.copyLineInfo(i.nimNode)
      for kid in tree:
        stampLineInfo(kid)
    stampLineInfo(call)

    result.add nnkOfBranch.newTree(newLit(ipNow), call)


# Generate code for tracing the parser.
# An empty stub is generated if tracing is disabled.
#
# With tracing enabled the generated code holds a `doTrace` proc that echoes
# one table row per executed instruction, and a `trace` template that calls it
# with the current instruction pointer and subject index. Without tracing only
# a `trace` template that discards its arguments is generated, so the call
# sites emitted by genCasesCode stay valid in both cases.

proc genTraceCode*(program: Program, sType, uType, uId, ms, s, si, simax, ip: NimNode): NimNode =

  when npegTrace:

    result = quote:

      proc doTrace[sType](`ms`: var MatchState, iname, opname: string, ip: int, s: openArray[sType], si: int, ms: var MatchState, msg: string) {.nimcall.} =
        echo align(if ip >= 0: $ip else: "", 3) &
          "|" & align($(peek(ms.precStack)), 3) &
          "|" & align($si, 3) &
          "|" & alignLeft(dumpSubject(s, si, 24), 24) &
          "|" & alignLeft(iname, 15) &
          "|" & alignLeft(opname & " " & msg, 40) &
          "|" & repeat("*", ms.backStack.top)

      template trace(`ms`: var MatchState, iname, opname: string, `s`: openArray[`sType`], msg = "") =
        doTrace(`ms`, iname, opname, `ip`, `s`, `si`, `ms`, msg)

  else:

    result = quote:

      template trace(`ms`: var MatchState, iname, opname: string, `s`: openArray[`sType`], msg = "") =
        discard


# Augment exception stack traces with the NPeg return stack and re-raise.
#
# The generated code looks up the symbol for the current instruction pointer
# and for every address popped from the return stack, and turns each of them
# into a stack trace entry.

proc genExceptionCode(ms, ip, si, simax, symTab: NimNode): NimNode =

  result = quote:

    # Helper proc to add a stack frame for the given ip
    var trace: seq[StackTraceEntry]
    let symTab = `symTab`
    proc aux(ip: int) =
      let sym = symTab[ip]
      trace.insert StackTraceEntry(procname: cstring(sym.repr), filename: cstring(sym.lineInfo.filename), line: sym.lineInfo.line)
      # On older Nim versions e.trace is not accessible, in this case just
      # dump the exception to stdout if npegStacktrace is enabled
      when npegStacktrace:
        echo $(sym.lineInfo) & ": " & sym.repr

    # Emit current IP and unwind all addresses from the return stack
    aux(`ip`)
    while `ms`.retStack.top > 0:
      aux(`ms`.retStack.pop())

    let e = getCurrentException()

    when compiles(e.trace.pop()):
      # drop the generated parser fn() from the trace and replace by the NPeg frames
      discard e.trace.pop()
      e.trace.add trace

    # Re-raise the exception with the augmented stack trace and match index filled in
    if e of NPegException:
      let eref = (ref NPegException)(e)
      eref.matchLen = `si`
      eref.matchMax = `simax`
    raise


# Convert the list
# of parser instructions into a Nim finite state machine
#
# - sType is the base type of the subject; typically `char` but can be specified
#   to be another type by the user
# - uType is the type of the userdata, if not used this defaults to `bool`
# - uId is the identifier of the userdata, if not used this defaults to `userdata`
#
# All identifiers that are shared between the generated fragments carry the
# "_NP" suffix.

proc genCode*(program: Program, sType, uType, uId: NimNode): NimNode =

  const suffix = "_NP"

  let
    count = program.patt.high
    ms = ident("ms" & suffix)
    s = ident("s" & suffix)
    si = ident("si" & suffix)
    ip = ident("ip" & suffix)
    simax = ident("simax" & suffix)

  # The separate code fragments that are spliced into fn_run() below
  let
    casesCode = genCasesCode(program, sType, uType, uId, ms, s, si, simax, ip)
    loopCode = genLoopCode(program, casesCode)
    traceCode = genTraceCode(program, sType, uType, uId, ms, s, si, simax, ip)
    exceptionCode = genExceptionCode(ms, ip, si, simax, newLit(program.symTab))

  result = quote:

    proc fn_init(): MatchState[`sType`] {.gensym.} =
      result = MatchState[`sType`](
        retStack: initStack[RetFrame]("return", 8, npegRetStackSize),
        capStack: initStack[CapFrame[`sType`]]("capture", 8),
        backStack: initStack[BackFrame]("backtrace", 8, npegBackStackSize),
        precStack: initStack[PrecFrame]("precedence", 8, 16),
      )
      push(result.precStack, 0)


    proc fn_run(`ms`: var MatchState[`sType`], `s`: openArray[`sType`], `uId`: var `uType`): MatchResult[`sType`] {.gensym.} =

      # Create local instances of performance-critical MatchState vars, this
      # saves a dereference on each access

      var
        `ip`: range[0..`count`] = `ms`.ip
        `si` = `ms`.si
        `simax` = `ms`.simax

      # These templates are available for code blocks

      template validate(o: bool) {.used.} =
        if not o: return false

      template fail() {.used.} =
        return false

      template push(`s`: string|`sType`) {.used.} =
        push(`ms`.capStack, CapFrame[`sType`](cft: cftOpen, ck: ckPushed))
        push(`ms`.capStack, CapFrame[`sType`](cft: cftClose, ck: ckPushed, sPushed: `s`))

      # Emit trace and loop code

      try:
        `traceCode`
        `loopCode`
      except CatchableError:
        `exceptionCode`

      # When the parsing machine is done, copy the local copies of the
      # matchstate back, close the capture stack and collect all the captures
      # in the match result

      `ms`.ip = `ip`
      `ms`.si = `si`
      `ms`.simax = `simax`
      result.matchLen = `ms`.si
      result.matchMax = `ms`.simax
      if result.ok and `ms`.capStack.top > 0:
        result.cs = fixCaptures(`s`, `ms`.capStack, FixAll)

    # This is the result of genCode: a Parser object with two function
    # pointers: fn_init: initializes a MatchState object for this parser
    #           fn_run: performs the parsing of the subject on the given matchstate

    Parser[`sType`,`uType`](fn_init: fn_init, fn_run: fn_run)

  when npegGcsafe:
    result[0].addPragma(ident("gcsafe"))

  when npegExpand:
    echo repr result

================================================ FILE: src/npeg/common.nim ================================================

import strutils
import tables
import macros
import bitops


const

  # Some constants with "sane" defaults, configurable with compiler flags

  npegPattMaxLen* {.intdefine.} = 4096
  npegInlineMaxLen* {.intdefine.} = 30
  npegRetStackSize* {.intdefine.} = 1024
  npegBackStackSize* {.intdefine.} = 1024
  npegOptimize* {.intdefine.} = 255

  # Feature switches, enabled by passing the matching -d:<name> define

  npegDebug* = defined(npegDebug)
  npegTrace* = defined(npegTrace)
  npegExpand* = defined(npegExpand)
  npegGraph* = defined(npegGraph)
  npegGcsafe* = defined(npegGcsafe)
  npegStacktrace* = defined(npegStacktrace)

  # Various optimizations.
These can be disabled for testing purposes # or when suspecting bugs in the optimization stages npegOptSets* = npegOptimize.testBit(0) npegOptHeadFail* = npegOptimize.testBit(1) npegOptCapShift* = npegOptimize.testBit(2) npegOptChoiceCommit* = npegOptimize.testBit(3) type NPegException* = object of CatchableError matchLen*: int matchMax*: int NPegParseError* = object of NPegException NPegStackOverflowError* = object of NPegException NPegUnknownBackrefError* = object of NPegException NPegCaptureOutOfRangeError* = object of NPegException CapFrameType* = enum cftOpen, cftClose CapKind* = enum ckVal, # Value capture ckPushed, # Pushed capture ckCodeBlock, # Code block capture ckRef # Reference CapFrame*[S] = object cft*: CapFrameType # Capture frame type name*: string # Capture name si*: int # Subject index ck*: CapKind # Capture kind when S is char: sPushed*: string # Pushed capture, overrides subject slice else: sPushed*: S # Pushed capture, overrides subject slice Ref* = object key*: string val*: string Opcode* = enum opChr, # Matching: Character opLit, # Matching: Literal opSet, # Matching: Character set and/or range opAny, # Matching: Any character opNop, # Matching: Always matches, consumes nothing opSpan # Matching: Match a sequence of 0 or more character sets opChoice, # Flow control: stores current position opCommit, # Flow control: commit previous choice opCall, # Flow control: call another rule opJump, # Flow control: jump to target opReturn, # Flow control: return from earlier call opFail, # Fail: unwind stack until last frame opCapOpen, # Capture open opCapClose, # Capture close opBackref # Back reference opErr, # Error handler opPrecPush, # Precedence stack push opPrecPop, # Precedence stack pop CharSet* = set[char] Assoc* = enum assocLeft, assocRight Inst* = object case op*: Opcode of opChoice, opCommit: ipOffset*: int siOffset*: int of opChr: ch*: char of opLit: lit*: NimNode of opCall, opJump: callLabel*: string callOffset*: int of opSet, opSpan: cs*: 
CharSet of opCapOpen, opCapClose: capKind*: CapKind capAction*: NimNode capName*: string capSiOffset*: int of opErr: msg*: string of opFail, opReturn, opAny, opNop, opPrecPop: discard of opBackref: refName*: string of opPrecPush: prec*: int assoc*: Assoc failOffset*: int # Debug info name*: string nimNode*: NimNode indent*: int Patt* = seq[Inst] Symbol* = object ip*: int name*: string repr*: string lineInfo*: LineInfo SymTab* = object syms*: seq[Symbol] Rule* = object name*: string patt*: Patt repr*: string lineInfo*: LineInfo Program* = object patt*: Patt symTab*: SymTab Template* = ref object name*: string args*: seq[string] code*: NimNode Grammar* = ref object rules*: Table[string, Rule] templates*: Table[string, Template] # # SymTab implementation # proc add*(s: var SymTab, ip: int, name: string, repr: string = "", lineInfo: LineInfo = LineInfo()) = let symbol = Symbol(ip: ip, name: name, repr: repr, lineInfo: lineInfo) s.syms.add(symbol) proc `[]`*(s: SymTab, ip: int): Symbol = for sym in s.syms: if ip >= sym.ip: result = sym proc `[]`*(s: SymTab, name: string): Symbol = for sym in s.syms: if name == sym.name: return sym proc contains*(s: SymTab, ip: int): bool = for sym in s.syms: if ip == sym.ip: return true proc contains*(s: SymTab, name: string): bool = for sym in s.syms: if name == sym.name: return true # # Some glue to report parse errors without having to pass the original # NimNode all the way down the call stack # var gCurErrorNode {.compileTime} = newEmptyNode() proc setKrakNode*(n: NimNode) = gCurErrorNode.copyLineInfo(n) template krak*(n: NimNode, msg: string) = error "NPeg: error at '" & n.repr & "': " & msg & "\n", n template krak*(msg: string) = krak gCurErrorNode, msg # # Misc helper functions # proc subStrCmp*(s: openArray[char], slen: int, si: int, s2: string): bool = if si > slen - s2.len: return false for i in 0.. slen - s2.len: return false for i in 0.. len: result = result[0..len-1] & "..." 
# This proc flattens AST trees of `|` operators into a single call to
# `choice()` with all arguments in one call. e.g, it will convert `A | B | C`
# into `choice(A, B, C)`. Nodes that are not a `|` infix are copied, with their
# children flattened recursively.

proc flattenChoice*(n: NimNode, nChoice: NimNode = nil): NimNode =

  proc isChoice(node: NimNode): bool =
    node.kind == nnkInfix and node[0].eqIdent("|")

  # Add the operands of a (possibly nested) `|` tree to `call`, left to right
  proc addToChoice(node, call: NimNode) =
    if node.isChoice:
      addToChoice(node[1], call)
      addToChoice(node[2], call)
    else:
      call.add flattenChoice(node)

  if n.isChoice:
    result = nnkCall.newTree(ident "choice")
    addToChoice(n, result)
  else:
    result = copyNimNode(n)
    for kid in n:
      result.add flattenChoice(kid)


# Create a printable representation of a single character: newline, carriage
# return and tab get their usual backslash escape, printable ASCII is passed
# unchanged and everything else is rendered as a two digit lower case `\x`
# hex escape.

proc escapeChar*(c: char): string =
  case c
  of '\n': result = "\\n"
  of '\r': result = "\\r"
  of '\t': result = "\\t"
  of ' '..'~': result = $c
  else: result = "\\x" & toHex(c.int, 2).toLowerAscii


# Create a short and friendly text representation of a character set:
# consecutive members are collapsed into `'a'..'z'` ranges, all entries are
# comma separated and enclosed in curly braces.

proc dumpSet*(cs: CharSet): string =
  result = "{"
  var code = 0
  while code <= 255:
    # Scan over the run of consecutive members starting at `code`
    let runStart = code
    while code <= 255 and code.char in cs:
      inc code
    let runLast = code - 1
    if runLast == runStart:
      result.add "'" & escapeChar(runStart.char) & "',"
    elif runLast > runStart:
      result.add "'" & escapeChar(runStart.char) & "'..'" & escapeChar(runLast.char) & "',"
    # `code` is now a non-member or past the end, skip it
    inc code
  # Drop the comma trailing the last entry
  if result[^1] == ',':
    result.setLen(result.len - 1)
  result.add "}"


# Create a friendly version of the given subject, starting at index `o`:
# chars are escaped with escapeChar(), other element types are rendered with
# their `repr`. Elements are added as long as the result stays shorter than
# `l` characters.

proc dumpSubject*[S](s: openArray[S], o:int=0, l:int=1024): string =
  for idx in o ..< s.len:
    when S is char:
      let a = escapeChar s[idx]
    else:
      mixin repr
      let a = s[idx].repr
    if result.len >= l-a.len:
      return
    result.add a


proc `$`*(i: Inst, ip=0): string = var args: string case i.op: of opChr: args = " '" & escapeChar(i.ch) & "'" of opChoice, opCommit: args = " " & $(ip+i.ipOffset) of opCall, opJump: args = " " & $(ip+i.callOffset) of opCapOpen, opCapClose: args = " " & $i.capKind if i.capSiOffset != 0: args &= "(" & $i.capSiOffset & ")" of opBackref: args = " " & i.refName of opPrecPush: args = " @" &
$i.prec else: discard if i.failOffset != 0: args.add " " & $(ip+i.failOffset) let tmp = if i.nimNode != nil: i.nimNode.repr.truncate(30) else: "" result.add alignLeft(i.name, 15) & alignLeft(repeat(" ", i.indent) & ($i.op).toLowerAscii[2..^1] & args, 25) & " " & tmp proc `$`*(program: Program): string = for ip, i in program.patt.pairs: if ip in program.symTab: result.add "\n" & program.symTab[ip].repr & "\n" result.add align($ip, 4) & ": " & `$`(i, ip) & "\n" proc slice*(s: openArray[char], iFrom, iTo: int): string = let len = iTo - iFrom result.setLen(len) for i in 0.. " & n2.escape & " [ color=" & colors[meth] & "];" d.edges[l] = true proc addPatt*(d: Dot, name: string, len: int) = if d != nil: var color = "black" if len > 10: color = "orange" if len > 100: color = "red" d.nodes.add " " & name.escape & " [ fillcolor=lightgrey color=" & color & " label=\"" & name & "/" & $len & "\"];" proc dump*(d: Dot) = const npegDotDir {.strdefine.}: string = "" when npegDotDir != "": let fname = npegDotDir & "/" & d.name & ".dot" echo "Dumping dot graph file to " & fname & "..." var o: string o.add "digraph dot {\n" o.add " graph [ center=true, margin=0.2, nodesep=0.1, ranksep=0.3 ];\n" o.add " node [ shape=box, style=\"rounded,filled\" width=0, height=0, fontname=Helvetica, fontsize=10];\n" o.add " edge [ fontname=Helvetica, fontsize=10];\n" for k, v in d.edges: o.add k & "\n" for n in d.nodes: o.add n & "\n" o.add "}\n" writeFile fname, o ================================================ FILE: src/npeg/grammar.nim ================================================ import tables import macros import strutils import npeg/[common,dot] # This is the global instance of pattern library. This is itself a grammar # where all patterns are stored with qualified names in the form of # .. At grammar link time all unresolved patterns are # looked up from this global table. var gPattLib {.compileTime.} = new Grammar # Store a grammar in the library. 
# The rule names and all unqualified identifiers in the grammar are expanded
# to qualified names in the form `libName.ruleName` to make sure they are
# easily resolved when they are later imported by other grammars. With an
# empty `libName` the names are stored as they are.

proc libStore*(libName: string, grammar: Grammar) =

  proc qualify(name: string): string =
    if libName.len > 0: libName & "." & name else: name

  for ruleName, rule in grammar.rules:
    let qualifiedName = qualify(ruleName)
    var stored = Rule(name: qualifiedName)
    for inst in rule.patt.items:
      var inst2 = inst
      # Only calls to labels that are not yet qualified are rewritten
      if inst2.op == opCall and "." notin inst2.callLabel:
        inst2.callLabel = qualify(inst2.callLabel)
      stored.patt.add inst2
    gPattLib.rules[qualifiedName] = stored

  for tname, t in grammar.templates:
    gPattLib.templates[qualify(tname)] = t

#
# Add rule to a grammar. A warning is emitted when a rule with this name is
# already present, after which the old rule is replaced. Instructions that do
# not have a name yet are named after the rule.
#

proc addRule*(grammar: Grammar, name: string, patt: Patt, repr: string = "", lineInfo: LineInfo = LineInfo()) =
  if name in grammar.rules:
    warning "Redefinition of rule '" & name & "'"
  var rule = Rule(name: name, patt: patt, repr: repr, lineInfo: lineInfo)
  for inst in rule.patt.mitems:
    if inst.name == "":
      inst.name = name
  grammar.rules[name] = rule

# Try to import the given rule from the pattern library into a grammar. Returns
# true if import succeeded, false if not found.

proc libImportRule*(name: string, grammar: Grammar): bool =
  if name notin gPattLib.rules:
    return false
  grammar.addRule name, gPattLib.rules[name].patt
  when npegDebug:
    echo "importing ", name
  result = true


# Look up a template in the pattern library, returns nil if it is not found

proc libImportTemplate*(name: string): Template =
  if name in gPattLib.templates:
    result = gPattLib.templates[name]


# Shadow the given name in the grammar by creating an unique new name,
# and moving the original rule. Returns the new name of the rule.

proc shadow*(grammar: Grammar, name: string): string =
  var gShadowId {.global.} = 0
  inc gShadowId
  result = name & "-" & $gShadowId
  when npegDebug:
    echo " shadow ", name, " -> ", result
  grammar.rules[result] = grammar.rules[name]
  grammar.rules.del name


# Link a list of patterns into a grammar, which is itself again a valid
# pattern.
Start with the initial rule, add all other non terminals and fixup # opCall addresses proc link*(grammar: Grammar, initial_name: string, dot: Dot = nil): Program = if initial_name notin grammar.rules: error "inital rule '" & initial_name & "' not found" var retPatt: Patt var symTab: SymTab var ruleRepr: Table[int, string] # Recursively emit a pattern and all patterns it calls which are # not yet emitted proc emit(name: string) = if npegDebug: echo "emit ", name let rule = grammar.rules[name] if rule.patt.len > 0: let ip = retPatt.len symTab.add(ip, name, rule.repr, rule.lineInfo) retPatt.add rule.patt retPatt.add Inst(op: opReturn, name: rule.patt[0].name) for i in rule.patt: if i.op == opCall and i.callLabel notin symTab: if i.callLabel notin grammar.rules and not libImportRule(i.callLabel, grammar): error "Npeg: rule \"" & name & "\" is referencing undefined rule \"" & i.callLabel & "\"" dot.add(name, i.callLabel, "call") emit i.callLabel emit initial_name # Fixup call addresses and do tail call optimization for ip, i in retPatt.mpairs: if i.op == opCall: i.callOffset = symTab[i.callLabel].ip - ip if i.op == opCall and retPatt[ip+1].op == opReturn: i.op = opJump # Choice/Commit pairs that touch because of head fail optimization can be # replaced by a jump and a nop when npegOptChoiceCommit: for i in 0..= T.low.BiggestInt and v <= T.high.BiggestInt grammar "types": bool <- "true" | "false" # Unsigned decimal uint <- +Digit uint8 <- >+uint: validate checkRange(uint8, parseInt, $1) uint16 <- >+uint: validate checkRange(uint16, parseInt, $1) uint32 <- >+uint: validate checkRange(uint32, parseInt, $1) # Signed decimal int <- ?'-' * uint int8 <- >int: validate checkRange(int8, parseInt, $1) int16 <- >int: validate checkRange(int16, parseInt, $1) int32 <- >int: validate checkRange(int32, parseInt, $1) int64 <- >int: validate checkRange(int64, parseInt, $1) # Hexadecimal hex <- '0' * {'x','X'} * +Digit hex8 <- >+uhex: validate checkRange(uint8, parseHexInt, $1) hex16 <- 
>+uhex: validate checkRange(uint16, parseHexInt, $1) hex32 <- >+uhex: validate checkRange(uint32, parseHexInt, $1)
================================================ FILE: src/npeg/lib/uri.nim ================================================

import npeg

when defined(nimHasUsed): {.used.}

# The grammar below is a literal translation of the ABNF notation of the
# RFC. Optimizations can be made to limit backtracking, but this is a nice
# example how to create a parser from a RFC protocol description.
#
# Mind that ABNF uses "/" for alternation where NPeg uses `|`; a literal
# slash in the RFC is the string "/" in this grammar.

grammar "uri":

  # A complete URI, anchored at the end of the subject by `!1`

  URI <- scheme * ":" * hier_part * ?( "?" * query) * ?( "#" * fragment) * !1

  hier_part <- "//" * authority * path

  URI_reference <- URI | relative_ref

  absolute_uri <- scheme * ":" * hier_part * ?( "?" * query)

  relative_ref <- relative_part * ?( "?" * query) * ?( "#" * fragment)

  relative_part <- "//" * authority * path_abempty |
                   path_absolute |
                   path_noscheme |
                   path_empty

  scheme <- (Alpha * *( Alpha | Digit | "+" | "-" | "." ))

  authority <- ?(userinfo * "@") * host * ?( ":" * port)
  userinfo <- *(unreserved | pct_encoded | sub_delims | ":")

  host <- (IP_literal | IPv4address | reg_name)
  port <- *Digit

  IP_literal <- "[" * (IPv6address | IPvFuture) * "]"

  IPvFuture <- "v" * +Xdigit * "." * +(unreserved | sub_delims | ":")

  IPv6address <- (h16 * ":")[6] * ls32 |
                 "::" * (h16 * ":")[5] * ls32 |
                 ?( h16 ) * "::" * (h16 * ":")[4] * ls32 |
                 ?( h16 * (":" * h16)[0..1] ) * "::" * (h16 * ":")[3] * ls32 |
                 ?( h16 * (":" * h16)[0..2] ) * "::" * (h16 * ":")[2] * ls32 |
                 ?( h16 * (":" * h16)[0..3] ) * "::" * (h16 * ":") * ls32 |
                 ?( h16 * (":" * h16)[0..4] ) * "::" * ls32 |
                 ?( h16 * (":" * h16)[0..5] ) * "::" * h16 |
                 ?( h16 * (":" * h16)[0..6] ) * "::"

  h16 <- Xdigit[1..4]
  ls32 <- (h16 * ":" * h16) | IPv4address
  IPv4address <- dec_octet * "." * dec_octet * "." * dec_octet * "." * dec_octet

  # This accepts any run of one to three digits; the 0..255 range of the RFC
  # is not checked here

  dec_octet <- Digit[1..3]

  reg_name <- *(unreserved | pct_encoded | sub_delims)

  path <- path_abempty | # begins with "/" or is empty
          path_absolute | # begins with "/" but not "//"
          path_noscheme | # begins with a non-colon segment
          path_rootless | # begins with a segment
          path_empty # zero characters

  path_abempty <- (*( "/" * segment ))
  path_absolute <- ("/" * ?( segment_nz * *( "/" * segment ) ))
  path_noscheme <- (segment_nz_nc * *( "/" * segment ))
  path_rootless <- (segment_nz * *( "/" * segment ))
  path_empty <- 0

  segment <- *pchar
  segment_nz <- +pchar
  segment_nz_nc <- +( unreserved | pct_encoded | sub_delims | "@" )
                   # non_zero_length segment without any colon ":"

  pchar <- unreserved | pct_encoded | sub_delims | ":" | "@"

  # Query and fragment allow the literal characters "/" and "?" on top of
  # pchar (RFC 3986 sections 3.4 and 3.5)

  query <- *( pchar | "/" | "?" )

  fragment <- *( pchar | "/" | "?" )

  pct_encoded <- "%" * Xdigit * Xdigit

  unreserved <- Alpha | Digit | "-" | "." | "_" | "~"
  reserved <- gen_delims | sub_delims
  gen_delims <- ":" | "/" | "?" | "#" | "[" | "]" | "@"
  sub_delims <- "!" | "$" | "&" | "'" | "(" | ")" | "*" | "+" | "," | ";" | "="

================================================ FILE: src/npeg/lib/utf8.nim ================================================

import npeg

when defined(nimHasUsed): {.used.}

grammar "utf8":

  # Continuation byte of a multi byte sequence

  cont <- {128..191}

  # Matches any utf-8 codepoint glyph

  any <- {0..127} |
         {194..223} * cont[1] |
         {224..239} * cont[2] |
         {240..244} * cont[3]

  # The UTF-8 encoded byte order mark (U+FEFF): the three bytes EF BB BF

  bom <- "\xef\xbb\xbf"

  # Check for UTF-8 character classes.
Depends on the tables from # the nim unicode module space <- >utf8.any: validate unicode.isSpace($1) lower <- >utf8.any: validate unicode.isLower(runeAt($1, 0)) upper <- >utf8.any: validate unicode.isUpper(runeAt($1, 0)) alpha <- >utf8.any: validate unicode.isAlpha(runeAt($1, 0)) title <- >utf8.any: validate unicode.isTitle(runeAt($1, 0)) ================================================ FILE: src/npeg/parsepatt.nim ================================================ import tables, macros, sequtils, strutils, algorithm import npeg/[common,patt,dot,grammar] when npegGraph: import npeg/[railroad] # Recursively compile a PEG rule to a Pattern proc parsePatt*(pattName: string, nn: NimNode, grammar: Grammar, dot: Dot = nil): Patt = when npegDebug: echo "parse ", pattName, " <- ", nn.repr proc aux(n: NimNode): Patt = setKrakNode(n) proc inlineOrCall(callName: string): Patt = # Try to import symbol early so we might be able to inline or shadow it if callName notin grammar.rules: discard libImportRule(callName, grammar) if pattName == callName: if pattName in grammar.rules: let nameShadowed = grammar.shadow(pattName) return newCallPatt(nameShadowed) if callName in grammar.rules and grammar.rules[callName].patt.len < npegInlineMaxLen: when npegDebug: echo " inline ", callName dot.add(pattName, callName, "inline") return grammar.rules[callName].patt else: when npegDebug: echo " call ", callName dot.add(pattName, callName, "call") return newCallPatt(callName) proc applyTemplate(tName: string, arg: NimNode): NimNode = let t = if tName in grammar.templates: grammar.templates[tName] else: libImportTemplate(tName) if t != nil: if arg.len-1 != t.args.len: krak arg, "Wrong number of arguments for template " & tName & "(" & $(t.args.join(",")) & ")" proc aux(n: NimNode): NimNode = if n.kind == nnkIdent and n.strVal in t.args: result = arg[ find(t.args, n.strVal)+1 ] else: result = copyNimNode(n) for nc in n: result.add aux(nc) result = aux(t.code).flattenChoice() when npegDebug: echo 
"template ", tName, " = \n in: ", n.repr, "\n out: ", result.repr case n.kind: of nnkPar: if n.len > 1: krak n, "syntax error. Did you mean '|'?" result = aux n[0] of nnkIntLit: result = newPatt(n.intVal) of nnkStrLit: result = newPatt(n.strVal) of nnkCharLit: result = newPatt($n.intVal.char) of nnkCall: var name: string if n[0].kind == nnkIdent: name = n[0].strVal elif n[0].kind == nnkDotExpr: name = n[0].repr else: krak n, "syntax error" let n2 = applyTemplate(name, n) if n2 != nil: result = aux n2 elif name == "choice": result = choice(n[1..^1].map(aux)) elif n.len == 2: case name of "R": result = newBackrefPatt(n[1].strVal) elif n.len == 3: case name of "R": result = newPatt(aux n[2], ckRef, n[1].strVal) if result.len == 0: krak n, "Unknown template or capture '" & name & "'" of nnkPrefix: # Nim combines all prefix chars into one string. Handle prefixes # chars right to left var p = aux n[1] for c in n[0].strVal.reversed: case c: of '?': p = ?p of '+': p = +p of '*': p = *p of '!': p = !p of '&': p = &p of '>': p = >p of '@': p = @p else: krak n, "Unhandled prefix operator" result = p of nnkInfix: case n[0].strVal: of "*", "∙": result = aux(n[1]) * aux(n[2]) of "-": result = aux(n[1]) - aux(n[2]) of "^": result = newPattAssoc(aux(n[1]), intVal(n[2]), assocLeft) of "^^": result = newPattAssoc(aux(n[1]), intVal(n[2]), assocRight) else: krak n, "Unhandled infix operator" of nnkBracketExpr: let p = aux(n[0]) if n[1].kind == nnkIntLit: result = p{n[1].intVal} elif n[1].kind == nnkInfix and n[1][0].eqIdent(".."): result = p{n[1][1].intVal..n[1][2].intVal} else: krak n, "syntax error" of nnkIdent: result = inlineOrCall(n.strVal) of nnkDotExpr: result = inlineOrCall(n.repr) of nnkCurly: var cs: CharSet for nc in n: if nc.kind == nnkCharLit: cs.incl nc.intVal.char elif nc.kind == nnkInfix: if nc[0].kind == nnkIdent and nc[0].eqIdent(".."): for c in nc[1].intVal..nc[2].intVal: cs.incl c.char else: krak n, "syntax error" else: krak n, "syntax error" if cs.card == 0: 
result = newPatt(1) else: result = newPatt(cs) of nnkCallStrLit: case n[0].strVal: of "i": for c in n[1].strVal: result.add newPatt({c.toLowerAscii, c.toUpperAscii}) of "E": result = newErrorPatt(n[1].strVal) else: krak n, "unhandled string prefix" of nnkBracket: result.add newLitPatt n[0] else: echo n.astGenRepr krak n, "syntax error" for i in result.mitems: if i.nimNode == nil: i.nimNode = n result = aux(nn.flattenChoice()) dot.addPatt(pattName, result.len) # # Parse a grammar. A grammar consists of named rules, where each rule is one # pattern # proc parseGrammar*(ns: NimNode, dot: Dot=nil, dumpRailroad = true): Grammar = result = new Grammar for n in ns: if n.kind == nnkInfix and n[0].eqIdent("<-"): case n[1].kind of nnkIdent, nnkDotExpr, nnkPrefix: let name = if n[1].kind == nnkPrefix: when declared(expectIdent): expectIdent n[1][0], ">" n[1][1].repr else: n[1].repr var patt = parsePatt(name, n[2], result, dot) if n.len == 4: patt = newPatt(patt, ckCodeBlock) patt[patt.high].capAction = n[3] result.addRule(name, if n[1].kind == nnkPrefix: >patt else: patt, n.repr, n.lineInfoObj) when npegGraph: if dumpRailroad: echo parseRailroad(n[2], result).wrap(name) of nnkCall: if n.len > 3: error "Code blocks can not be used on templates", n[3] var t = Template(name: n[1][0].strVal, code: n[2]) for i in 1..= npegPattMaxLen: krak "NPeg: grammar too complex, (" & $p.len & " > " & $npegPattMaxLen & ").\n" & "If you think this is a mistake, increase the maximum size with -d:npegPattMaxLen=N" # Checks if the passed patt matches an empty subject. This is done by executing # the pattern as if it was passed an empty subject and see how it terminates. 
# Checks if the passed patt matches an empty subject. This is done by
# executing the pattern as if it was passed an empty subject and see how it
# terminates:
#
# - instructions that need to consume input fail: they follow their fail
#   offset if they have one, otherwise backtrack to the last choice, and when
#   there is nothing to backtrack to the pattern does not match
# - opErr, opReturn and opCall make the answer `false`
# - running off the end of the pattern means it matched

proc matchesEmpty(patt: Patt): bool =
  var backStack = initStack[int]("backtrack", 8, 32)
  var ip: int
  while ip < patt.len:
    let i = patt[ip]
    case i.op
      of opChoice:
        push(backStack, ip+i.ipOffset)
        inc ip
      of opCommit:
        discard pop(backStack)
        ip += i.ipOffset
      of opJump:
        ip += i.callOffset
      of opCapOpen, opCapClose, opNop, opSpan, opPrecPush, opPrecPop:
        # These all succeed without consuming anything
        inc ip
      of opErr, opReturn, opCall:
        return false
      of opAny, opChr, opLit, opSet, opBackref, opFail:
        if i.failOffset != 0:
          ip += i.failOffset
        elif backStack.top > 0:
          ip = pop(backStack)
        else:
          return false
  return true


# Calculate how far captures or choices can be shifted into this pattern
# without consequences; this allows the pattern to fail before pushing to the
# backStack or capStack
#
# - p: the pattern to inspect, only its first instruction is looked at
# - enable: compile time switch for this optimization; when false no shift is
#   ever reported
#
# Returns a (subject index shift, instruction pointer shift) tuple, which is
# (1, 1) if the pattern starts with a single-character match that has no fail
# offset of its own, and (0, 0) in all other cases. An empty pattern has
# nothing to shift into and also yields (0, 0).

proc canShift(p: Patt, enable: static[bool]): (int, int) =
  when enable:
    if p.len > 0:
      let head = p[0]
      if head.failOffset == 0 and head.op in {opChr, opAny, opSet}:
        result = (1, 1)


### Atoms

# Create a pattern matching the literal string `s`, one opChr per character

proc newPatt*(s: string): Patt =
  for ch in s:
    result.add Inst(op: opChr, ch: ch)

# Create a pattern matching one subject element against the expression `n`

proc newLitPatt*(n: NimNode): Patt =
  result.add Inst(op: opLit, lit: n)

proc newPatt*(p: Patt, ck: CapKind, name = ""): Patt = let (siShift, ipShift) = p.canShift(npegOptCapShift) result.add p[0..
0: for i in 1..n: result.add Inst(op: opAny) else: result.add Inst(op: opNop) proc newPatt*(cs: CharSet): Patt = result.add Inst(op: opSet, cs: cs) proc newBackrefPatt*(refName: string): Patt = result.add Inst(op: opBackref, refName: refName) proc newReturnPatt*(): Patt = result.add Inst(op: opReturn) proc newErrorPatt*(msg: string): Patt = result.add Inst(op: opErr, msg: msg) # Add a choice/commit pair around pattern P, try to optimize head # fails when possible proc addChoiceCommit(addTo: var Patt, p: Patt, choiceOffset, commitOffset: int) = let (siShift, ipShift) = p.canShift(npegOptHeadFail) for n in 0..`*(p: Patt): Patt = return newPatt(p, ckVal) proc `!`*(p: Patt): Patt = result.addChoiceCommit(p, p.len+3, 1) result.add Inst(op: opFail) proc `&`*(p: Patt): Patt = result.add !(!p) proc `@`*(p: Patt): Patt = result.addChoiceCommit(p, p.len+2, 3) result.add Inst(op: opAny) result.add Inst(op: opJump, callOffset: - p.len - 3) ### Infixes proc `*`*(p1, p2: Patt): Patt = result.add p1 result.add p2 result.checkSanity # choice() is generated from | operators by flattenChoice(). 
# # Optimizations done here: # - convert to union if all elements can be represented as a set # - head fails: when possible, opChoice is shifted into a pattern to # allow the pattern to fail before emitting the opChoice proc choice*(ps: openArray[Patt]): Patt = var csUnion: CharSet var allSets = true for p in ps: var cs: CharSet if p.toSet(cs): csUnion = csUnion + cs else: allSets = false if allSets: result.add Inst(op: opSet, cs: csUnion) return result var lenTot, ip: int lenTot = foldl(ps, a + b.len+2, 0) for i, p in ps: if i < ps.high: result.addChoiceCommit(p, p.len+2, lenTot-ip-p.len-3) ip += p.len + 2 else: result.add p proc `-`*(p1, p2: Patt): Patt = var cs1, cs2: CharSet if p1.toSet(cs1) and p2.toSet(cs2): result.add Inst(op: opSet, cs: cs1 - cs2) else: result.add !p2 result.add p1 proc newPattAssoc*(p: Patt, prec: BiggestInt, assoc: Assoc): Patt = result.add Inst(op: opPrecPush, prec: prec.int, assoc: assoc) result.add p result.add Inst(op: opPrecPop) ### Others proc `{}`*(p: Patt, n: BiggestInt): Patt = for i in 1..n: result.add p proc `{}`*(p: Patt, range: HSlice[system.BiggestInt, system.BiggestInt]): Patt = result.add p{range.a} for i in range.a.. 
n.w: l.len-n.w else: 0, 0, 1) for i, c in l: result.poke fgCap, (result.w/%2 - l.len/%2 + i, -1, $c) proc `*`(n1, n2: Node): Node = result = Node(w: n1.w + n2.w + 1, y0: min(n1.y0, n2.y0), y1: max(n1.y1, n2.y1)) result.poke fgGreen, (n1.w, 0, "»") result.kids.add Kid(n: n1, dx: 0) result.kids.add Kid(n: n2, dx: n1.w+1) proc `?`(n: Node): Node = result = n.pad(1, 1, 1, 0) let (x1, x2, y1, y2) = (0, n.w+1, -1 + n.y0, 0) result.poke fgLine, (x1, y1, "╭"), (x1, y2, "┴"), (x2, y1, "╮"), (x2, y2, "┴") for x in x1+1..x2-1: result.poke fgLine, (x, y1, "─") for y in y1+1..y2-1: result.poke fgLine, (x1, y, "│"), (x2, y, "│") result.poke fgLine, ((x1+x2)/%2, y1, "»") proc `+`(n: Node): Node = result = n.pad(1, 1, 0, 1) let (x1, x2, y1, y2) = (0, n.w+1, 0, n.y1+1) result.poke fgLine, (x1, y1, "┬"), (x1, y2, "╰"), (x2, y1, "┬"), (x2, y2, "╯") for x in x1+1..x2-1: result.poke fgLine, (x, y2, "─") for y in y1+1..y2-1: result.poke fgLine, (x1, y, "│"), (x2, y, "│") result.poke fgLine, ((x1+x2)/%2, y2, "«") proc `!`(n: Node): Node = result = n.pad(0, 0, 1) let (x0, x1) = (1, result.w-2) for x in x0..x1: result.poke fgRed, (x, result.y0, "━") proc `-`*(p1, p2: Node): Node = return !p2 * p1 proc `*`(n: Node): Node = ? + n proc `@`(n: Node): Node = result = *(!n * newNode("1")) * n proc `&`(n: Node): Node = result = ! ! n proc choice(ns: varArgs[Node]): Node = var wmax = 0 for n in ns: wmax = max(wmax, n.w) var dys = @[0] var dy = 0 for i in 0.. 
0: result.poke fgLine, (x0, dys[i], "├"), (x1, dys[i], "┤") result.poke fgLine, (x0, dys[dys.high], "╰"), (x1, dys[dys.high], "╯") proc `{}`*(p: Node, n: BiggestInt): Node = result = p for i in 1..': p = newCapNode(p) else: p = p result = p of nnkInfix: case n[0].strVal: of "*", "∙": result = aux(n[1]) * aux(n[2]) of "-": result = aux(n[1]) - aux(n[2]) of "^": result = newPrecNode(aux(n[1]), intVal(n[2]), "<") of "^^": result = newPrecNode(aux(n[1]), intVal(n[2]), ">") else: discard of nnkBracketExpr: let p = aux(n[0]) if n[1].kind == nnkIntLit: result = p{n[1].intVal} elif n[1].kind == nnkInfix and n[1][0].eqIdent(".."): result = p{n[1][1].intVal..n[1][2].intVal} else: discard of nnkIdent: result = newNode("[" & n.strVal & "]", fgNonterm) of nnkDotExpr: result = newNode("[" & n.repr & "]", fgNonterm) of nnkCurly: var cs: CharSet for nc in n: if nc.kind == nnkCharLit: cs.incl nc.intVal.char elif nc.kind == nnkInfix: if nc[0].kind == nnkIdent and nc[0].eqIdent(".."): for c in nc[1].intVal..nc[2].intVal: cs.incl c.char if cs.card == 0: result = newNode("1", fgNonterm) else: result = newNode(dumpSet(cs), fgLit) of nnkCallStrLit: case n[0].strVal: of "i": result = newNode(n[1].strval) of "E": result = newNode("ERROR", fgError) of nnkBracket: result = newNode("[" & n[0].repr & "]", fgNonterm) else: discard let nnf = nn.flattenChoice result = aux(nnf) ================================================ FILE: src/npeg/stack.nim ================================================ # This module implements a basic stack[T]. This is used instead of seq[T] # because the latter has bad performance when unwinding more than one frame at # a time (ie, setlen). These stacks keep track of their own top and do not # shrink the underlying seq when popping or unwinding. 
type Stack*[T] = object name: string top*: int max: int frames: seq[T] proc `$`*[T](s: Stack[T]): string = for i in 0..= s.max: mixin NPegStackOverflowError raise newException(NPegStackOverflowError, s.name & " stack overflow, depth>" & $s.max) s.frames.setLen s.frames.len * 2 template push*[T](s: var Stack[T], frame: T) = if s.top >= s.frames.len: grow(s) s.frames[s.top] = frame inc s.top template pop*[T](s: var Stack[T]): T = assert s.top > 0 dec s.top s.frames[s.top] template peek*[T](s: Stack[T]): T = assert s.top > 0 s.frames[s.top-1] template `[]`*[T](s: Stack[T], idx: int): T = assert idx < s.top s.frames[idx] template update*[T](s: Stack[T], field: untyped, val: untyped) = assert s.top > 0 s.frames[s.top-1].field = val ================================================ FILE: src/npeg.nim ================================================ # # Copyright (c) 2019 Ico Doornekamp # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
#
# This parser implementation is based on the following papers:
#
# - A Text Pattern-Matching Tool based on Parsing Expression Grammars
#   (Roberto Ierusalimschy)
#
# - An efficient parsing machine for PEGs
#   (Jos Craaijo)
#

## Note: This document is rather terse, for the complete NPeg manual please refer
## to the README.md or the git project page at https://github.com/zevv/npeg
##
## NPeg is a pure Nim pattern matching library. It provides macros to compile
## patterns and grammars (PEGs) to Nim procedures which will parse a string and
## collect selected parts of the input. PEGs are not unlike regular
## expressions, but offer more power and flexibility, and have fewer ambiguities.
##
## Here is a simple example showing the power of NPeg: The macro `peg` compiles a
## grammar definition into a `parser` object, which is used to match a string and
## place the key-value pairs into the Nim table `words`:

runnableExamples:

  import npeg, strutils, tables

  var words: Table[string, int]

  let parser = peg "pairs":
    pairs <- pair * *(',' * pair) * !1
    word <- +Alpha
    number <- +Digit
    pair <- >word * '=' * >number:
      words[$1] = parseInt($2)

  doAssert parser.match("one=1,two=2,three=3,four=4").ok


import tables
import macros
import strutils
import npeg/[common,codegen,capture,parsepatt,grammar,dot]

export NPegException,
       NPegStackOverflowError,
       NPegUnknownBackrefError,
       NPegCaptureOutOfRangeError,
       NpegParseError,
       contains, `[]`, len

# Create a parser for a PEG grammar

proc pegAux(name: string, subjectType, userDataType, userDataId, n: NimNode): NimNode =
  # Shared implementation behind all `peg` macro overloads.
  #
  # `name` is the start rule, `subjectType` the element type of the subject,
  # `userDataType`/`userDataId` the type and identifier of the user data
  # passed to the generated parser, and `n` the grammar body. The grammar is
  # parsed, linked starting at `name`, and the generated code is returned as
  # the macro result.
  var dot = newDot(name)
  var grammar = parseGrammar(n, dot)
  var program = grammar.link(name, dot)
  let code = program.genCode(subjectType, userDataType, userDataId)
  dot.dump()
  code

macro peg*(name: untyped, n: untyped): untyped =
  ## Construct a parser from the given PEG grammar. `name` is the initial
  ## grammar rule where parsing starts. This macro returns a `Parser` type
  ## which can later be used for matching subjects with the `match()` proc
  # No user data supplied: use `char` subjects and a dummy `bool` user data
  # named `userdata`.
  pegAux name.strVal, ident "char", ident "bool", ident "userdata", n

macro peg*(name: untyped, userData: untyped, n: untyped): untyped =
  ## Construct a parser from the given PEG grammar. `name` is the initial
  ## grammar rule where parsing starts. This macro returns a `Parser` type
  ## which can later be used for matching subjects with the `match()` proc
  ##
  ## The `userdata` argument is a colon expression with an identifier and a
  ## type, this identifier is available in code block captures during parsing.
  expectKind(userData, nnkExprColonExpr)
  # userData is `identifier: Type`: [0] is the identifier, [1] the type.
  pegAux name.strVal, ident "char", userData[1], userData[0], n

macro peg*(name: untyped, subjectType, userData, n: untyped): untyped =
  ## Construct a parser from the given PEG grammar. `name` is the initial
  ## grammar rule where parsing starts. This macro returns a `Parser` type
  ## which can later be used for matching subjects with the `match()` proc
  ##
  ## The `subjectType` argument is a Nim type which should match the base
  ## type of the subject passed to `match()`.
  ##
  ## The `userdata` argument is a colon expression with an identifier and a
  ## type, this identifier is available in code block captures during parsing.
  expectKind(userData, nnkExprColonExpr)
  # userData is `identifier: Type`: [0] is the identifier, [1] the type.
  pegAux name.strVal, subjectType, userData[1], userData[0], n

template patt*(n: untyped): untyped =
  ## Construct a parser from a single PEG rule. This is similar to the regular
  ## `peg()` macro, but useful for short regexp-like parsers that do not need a
  ## complete grammar.
  peg anonymous:
    anonymous <- n

template patt*(n: untyped, code: untyped): untyped =
  ## Construct a parser from a single PEG rule. This is similar to the regular
  ## `peg()` macro, but useful for short regexp-like parsers that do not need a
  ## complete grammar. This variant takes a code block which will be used as
  ## code block capture for the anonymous rule.
  peg anonymous:
    anonymous <- n:
      code

macro grammar*(libNameNode: untyped, n: untyped) =
  ## This macro defines a collection of rules to be stored in NPeg's global
  ## grammar library.
  let libName = libNameNode.strVal
  # Railroad dumping is only requested for named libraries.
  let grammar = parseGrammar(n, dumpRailroad = libName != "")
  libStore(libName, grammar)


proc match*[S, T](p: Parser, s: openArray[S], userData: var T): MatchResult[S] =
  ## Match a subject string with the given generic parser. The returned
  ## `MatchResult` contains the result of the match and can be used to query
  ## any captures.
  var ms = p.fn_init()
  p.fn_run(ms, s, userData)


proc match*[S](p: Parser, s: openArray[S]): MatchResult[S] =
  ## Match a subject string with the given parser. The returned `MatchResult`
  ## contains the result of the match and can be used to query any captures.
  var userData: bool # dummy if user does not provide a type
  p.match(s, userData)


# Match a file

when defined(windows) or defined(posix):
  import memfiles, os
  proc matchFile*[T](p: Parser, fname: string, userData: var T): MatchResult[char] =
    ## Match the contents of the file `fname` with the given parser by
    ## memory-mapping it, passing `userData` on to the parser. An empty file
    ## is matched as the empty string.
    # memfiles.open() throws on empty files, work around that
    if os.getFileSize(fname) > 0:
      var m = memfiles.open(fname)
      try:
        var a: ptr UncheckedArray[char] = cast[ptr UncheckedArray[char]](m.mem)
        var ms = p.fn_init()
        result = p.fn_run(ms, toOpenArray(a, 0, m.size-1), userData)
      finally:
        # Always release the mapping, also when the parser raises (for
        # example on an error pattern or a failing code block capture).
        m.close()
    else:
      result = match(p, "", userData)

  proc matchFile*(p: Parser, fname: string): MatchResult[char] =
    ## Match the contents of the file `fname` with the given parser, without
    ## user data.
    var userData: bool # dummy if user does not provide a type
    matchFile(p, fname, userData)


proc captures*(mr: MatchResult[char]): seq[string] =
  ## Return all plain string captures from the match result
  for cap in collectCaptures(mr.cs):
    result.add cap.s

proc captures*[S](mr: MatchResult[S]): seq[S] =
  ## Return all plain captures from the match result, for parsers matching
  ## subjects of a generic element type `S`
  for cap in collectCaptures(mr.cs):
    result.add cap.s

template nimBug22740*() =
  ## Provide stub templates as a workaround for https://github.com/nim-lang/Nim/issues/22740.
## Invoke this template in your code if you want to define a parser in a generic proc. template `>`(a: untyped): untyped = discard template `*`(a: untyped): untyped = discard template `-`(a: untyped): untyped = discard template `+`(a: untyped): untyped = discard template `?`(a: untyped): untyped = discard template `!`(a: untyped): untyped = discard template `$`(a: untyped): untyped = discard import npeg/lib/core ================================================ FILE: tests/basics.nim ================================================ import unittest import strutils import npeg {.push warning[Spacing]: off.} suite "unit tests": test "atoms": doAssert patt(0 * "a").match("a").ok doAssert patt(1).match("a").ok doAssert patt(1).match("a").ok doAssert patt(2).match("a").ok == false doAssert patt("a").match("a").ok doAssert patt("a").match("b").ok == false doAssert patt("abc").match("abc").ok doAssert patt({'a'}).match("a").ok doAssert patt({'a'}).match("b").ok == false doAssert patt({'a','b'}).match("a").ok doAssert patt({'a','b'}).match("b").ok doAssert patt({'a','b'}).match("c").ok == false doAssert patt({'a'..'c'}).match("a").ok doAssert patt({'a'..'c'}).match("b").ok doAssert patt({'a'..'c'}).match("c").ok doAssert patt({'a'..'c'}).match("d").ok == false doAssert patt({'a'..'c'}).match("a").ok doAssert patt("").match("abcde").matchLen == 0 doAssert patt("a").match("abcde").matchLen == 1 doAssert patt("ab").match("abcde").matchLen == 2 doAssert patt(i"ab").match("AB").ok test "*: concatenation": doAssert patt("a" * "b").match("ab").ok #doAssert patt("a" ∙ "b").match("ab").ok test "?: zero or one": doAssert patt("a" * ?"b" * "c").match("abc").ok doAssert patt("a" * ?"b" * "c").match("ac").ok test "+: one or more": doAssert patt("a" * +"b" * "c").match("abc").ok doAssert patt("a" * +"b" * "c").match("abbc").ok doAssert patt("a" * +"b" * "c").match("ac").ok == false test "*: zero or more": doAssert patt(*'a').match("aaaa").ok doAssert patt(*'a' * 'b').match("aaaab").ok 
doAssert patt(*'a' * 'b').match("bbbbb").ok doAssert patt(*'a' * 'b').match("caaab").ok == false doAssert patt(+'a' * 'b').match("aaaab").ok doAssert patt(+'a' * 'b').match("ab").ok doAssert patt(+'a' * 'b').match("b").ok == false test "!: not predicate": doAssert patt('a' * !'b').match("ac").ok doAssert patt('a' * !'b').match("ab").ok == false test "&: and predicate": doAssert patt(&"abc").match("abc").ok doAssert patt(&"abc").match("abd").ok == false doAssert patt(&"abc").match("abc").matchLen == 0 test "@: search": doAssert patt(@"fg").match("abcdefghijk").matchLen == 7 test "[n]: count": doAssert patt(1[3]).match("aaaa").ok doAssert patt(1[4]).match("aaaa").ok doAssert patt(1[5]).match("aaaa").ok == false test "[m..n]: count": doAssert patt('a'[2..4] * !1).match("").ok == false doAssert patt('a'[2..4] * !1).match("a").ok == false doAssert patt('a'[2..4] * !1).match("aa").ok doAssert patt('a'[2..4] * !1).match("aaa").ok doAssert patt('a'[2..4] * !1).match("aaaa").ok doAssert patt('a'[2..4] * !1).match("aaaaa").ok == false doAssert patt('a'[0..1] * !1).match("").ok doAssert patt('a'[0..1] * !1).match("a").ok doAssert patt('a'[0..1] * !1).match("aa").ok == false test "|: ordered choice": doAssert patt("ab" | "cd").match("ab").ok doAssert patt("ab" | "cd").match("cd").ok doAssert patt("ab" | "cd").match("ef").ok == false doAssert patt(("ab" | "cd") | "ef").match("ab").ok == true doAssert patt(("ab" | "cd") | "ef").match("cd").ok == true doAssert patt(("ab" | "cd") | "ef").match("ef").ok == true doAssert patt("ab" | ("cd") | "ef").match("ab").ok == true doAssert patt("ab" | ("cd") | "ef").match("cd").ok == true doAssert patt("ab" | ("cd") | "ef").match("ef").ok == true test "-: difference": doAssert patt("abcd" - "abcdef").match("abcdefgh").ok == false doAssert patt("abcd" - "abcdf").match("abcdefgh").ok test "Builtins": doAssert patt(Digit).match("1").ok doAssert patt(Digit).match("a").ok == false doAssert patt(Upper).match("A").ok doAssert 
patt(Upper).match("a").ok == false doAssert patt(Lower).match("a").ok doAssert patt(Lower).match("A").ok == false doAssert patt(+Digit).match("12345").ok doAssert patt(+Xdigit).match("deadbeef").ok doAssert patt(+Graph).match(" x").ok == false test "Misc combos": doAssert patt('a' | ('b' * 'c')).match("a").ok doAssert patt('a' | ('b' * 'c') | ('d' * 'e' * 'f')).match("a").ok doAssert patt('a' | ('b' * 'c') | ('d' * 'e' * 'f')).match("bc").ok doAssert patt('a' | ('b' * 'c') | ('d' * 'e' * 'f')).match("def").ok test "Compile time 1": proc doTest(): string {.compileTime.} = var n: string let p = peg "number": number <- >+Digit: n = $1 doAssert p.match("12345").ok return n const v = doTest() doAssert v == "12345" test "Compile time 2": static: var n: string let p = peg "number": number <- >+Digit: n = $1 doAssert p.match("12345").ok doAssert n == "12345" test "matchMax": let s = peg "line": line <- one | two one <- +Digit * 'c' * 'd' * 'f' two <- +Digit * 'b' let r = s.match("1234cde") doAssert r.ok == false doAssert r.matchLen == 4 doAssert r.matchMax == 6 test "grammar1": let a = peg "r1": r1 <- "abc" r2 <- r1 * r1 doAssert a.match("abcabc").ok test "grammar2": let a = peg "r1": r2 <- r1 * r1 r1 <- "abc" doAssert a.match("abcabc").ok test "backref": doAssert patt(R("sep", Alpha) * *(1 - R("sep")) * R("sep") * !1).match("abbbba").ok doAssert patt(R("sep", Alpha) * *(1 - R("sep")) * R("sep") * !1).match("abbbbc").ok == false test "raise exception 1": let a = patt E"boom" expect NPegParseError: doAssert a.match("abcabc").ok test "raise exception 2": let a = patt 4 * E"boom" try: doAssert a.match("abcabc").ok except NPegParseError as e: doAssert e.matchLen == 4 doAssert e.matchMax == 4 test "out of range capture exception 1": expect NPegCaptureOutOfRangeError: let a = patt 1: echo capture[10].s doAssert a.match("c").ok test "out of range capture exception 2": expect NPegCaptureOutOfRangeError: let a = patt 1: echo $9 doAssert a.match("c").ok test "unknown backref error": 
expect NPegUnknownBackrefError: discard patt(R("sep", Alpha) * *(1 - R("sep")) * R("sap") * !1).match("abbbba") test "user validation": let p = peg "line": line <- uint8 * "," * uint8 * !1 uint8 <- >+Digit: let v = parseInt($1) validate(v>=0 and v<=255) doAssert p.match("10,10").ok doAssert p.match("0,255").ok doAssert not p.match("10,300").ok doAssert not p.match("300,10").ok test "user fail": let p = peg "line": line <- 1: fail() doAssert not p.match("a").ok test "templates": let p = peg "a": list(patt, sep) <- patt * *(sep * patt) commaList(patt) <- list(patt, ",") a <- commaList(>+Digit) doAssert p.match("11,22,3").captures == ["11","22","3"] test "templates with choices": let p = peg aap: one() <- "one" two() <- "one" three() <- "flip" | "flap" aap <- one() | two() | three() doAssert p.match("onetwoflip").ok ================================================ FILE: tests/captures.nim ================================================ import unittest import npeg import strutils import json {.push warning[Spacing]: off.} suite "captures": test "no captures": doAssert patt(1).match("a").captures == @[] test "string captures": doAssert patt(>1).match("ab").captures == @["a"] doAssert patt(>(>1)).match("ab").captures == @["a", "a"] doAssert patt(>1 * >1).match("ab").captures == @["a", "b"] doAssert patt(>(>1 * >1)).match("ab").captures == @["ab", "a", "b"] doAssert patt(>(>1 * >1)).match("ab").captures == @["ab", "a", "b"] test "code block captures": let p = peg "foo": foo <- >1: doAssert $1 == "a" doAssert @1 == 0 doAssert p.match("a").ok test "code block captures 2": let p = peg("foo", v: string): foo <- >1: v = $1 var a: string doAssert p.match("a", a).ok doAssert a == "a" test "code block captures 3": var a: string let p = patt >1: a = $1 doAssert p.match("a").ok doAssert a == "a" test "code block captures 4": let p = peg "foo": foo <- +Digit * >1: doAssert $1 == "a" doAssert @1 == 4 doAssert p.match("1234a").ok test "code block captures with typed parser": type 
Thing = object word: string number: int let s = peg("foo", t: Thing): foo <- word * number word <- >+Alpha: t.word = $1 number <- >+Digit: t.number = parseInt($1) var t = Thing() doAssert s.match("foo123", t).ok == true doAssert t.word == "foo" doAssert t.number == 123 when not defined(gcDestructors): test "Capture out of range": expect NPegException: let p = peg "l": l <- 1: echo $1 discard p.match("a") test "push": let p = peg "m": m <- >n * '+' * >n: push $(parseInt($1) + parseInt($2)) n <- +Digit let r = p.match("12+34") doAssert r.captures()[0] == "46" test "nested": doAssert patt(>(>1 * >1)).match("ab").captures == @["ab", "a", "b"] test "nested codeblock": let p = peg foo: foo <- >(>1 * b) b <- >1: push $1 doAssert p.match("ab").captures() == @["ab", "a", "b"] test "clyybber": let p = peg "m": m <- n * '+' * n: push $(parseInt($1) + parseInt($2)) >n <- +Digit let r = p.match("12+34") doAssert r.captures()[0] == "46" ================================================ FILE: tests/config.nims ================================================ switch("path", "$projectDir/../src") switch("hints", "off") ================================================ FILE: tests/examples.nim ================================================ import unittest import npeg import json import strutils import math import tables import npeg/lib/uri {.push warning[Spacing]: off.} suite "examples": ###################################################################### test "misc": let p1 = patt +{'a'..'z'} doAssert p1.match("lowercaseword").ok let p2 = peg "ident": lower <- {'a'..'z'} ident <- +lower doAssert p2.match("lowercaseword").ok ###################################################################### test "shadowing": let parser = peg "line": line <- uri.URI uri.scheme <- >uri.scheme uri.host <- >uri.host uri.port <- >+Digit uri.path <- >uri.path let r = parser.match("http://nim-lang.org:8080/one/two/three") doAssert r.captures == @["http", "nim-lang.org", "8080", "/one/two/three"] 
###################################################################### test "matchFile": when defined(windows) or defined(posix): let parser = peg "pairs": pairs <- pair * *(',' * pair) word <- +Alnum number <- +Digit pair <- (>word * '=' * >number) let r = parser.matchFile "tests/testdata" doAssert r.ok doAssert r.captures == @["one", "1", "two", "2", "three", "3", "four", "4"] ###################################################################### test "JSON parser": let json = """ { "glossary": { "title": "example glossary", "GlossDiv": { "title": "S", "GlossList": { "GlossEntry": { "ID": "SGML", "SortAs": "SGML", "GlossTerm": "Standard Generalized Markup Language", "Acronym": "SGML", "Abbrev": "ISO 8879:1986", "GlossDef": { "para": "A meta-markup language, used to create markup languages such as DocBook.", "GlossSeeAlso": ["GML", "XML"] }, "GlossSee": "markup" } } } } } """ let s = peg "doc": S <- *Space jtrue <- "true" jfalse <- "false" jnull <- "null" unicodeEscape <- 'u' * Xdigit[4] escape <- '\\' * ({ '{', '"', '|', '\\', 'b', 'f', 'n', 'r', 't' } | unicodeEscape) stringBody <- ?escape * *( +( {'\x20'..'\xff'} - {'"'} - {'\\'}) * *escape) jstring <- ?S * '"' * stringBody * '"' * ?S minus <- '-' intPart <- '0' | (Digit-'0') * *Digit fractPart <- "." 
* +Digit expPart <- ( 'e' | 'E' ) * ?( '+' | '-' ) * +Digit jnumber <- ?minus * intPart * ?fractPart * ?expPart doc <- JSON * !1 JSON <- ?S * ( jnumber | jobject | jarray | jstring | jtrue | jfalse | jnull ) * ?S jobject <- '{' * ( jstring * ":" * JSON * *( "," * jstring * ":" * JSON ) | ?S ) * "}" jarray <- "[" * ( JSON * *( "," * JSON ) | ?S ) * "]" doAssert s.match(json).ok ###################################################################### test "HTTP with action captures to Nim object": type Request = object proto: string version: string code: int message: string headers: Table[string, string] let s = peg("http", userdata: Request): space <- ' ' crlf <- '\n' * ?'\r' url <- +(Alpha | Digit | '/' | '_' | '.') eof <- !1 header_name <- +(Alpha | '-') header_val <- +(1-{'\n'}-{'\r'}) proto <- >(+Alpha): userdata.proto = $1 version <- >(+Digit * '.' * +Digit): userdata.version = $1 code <- >+Digit: userdata.code = parseInt($1) msg <- >(+(1 - '\r' - '\n')): userdata.message = $1 header <- >header_name * ": " * >header_val: userdata.headers[$1] = $2 response <- proto * '/' * version * space * code * space * msg headers <- *(header * crlf) http <- response * crlf * headers * eof let data = """ HTTP/1.1 301 Moved Permanently Content-Length: 162 Content-Type: text/html Location: https://nim.org/ """ var req: Request let res = s.match(data, req) doAssert res.ok doAssert req.proto == "HTTP" doAssert req.version == "1.1" doAssert req.code == 301 doAssert req.message == "Moved Permanently" doAssert req.headers["Content-Length"] == "162" doAssert req.headers["Content-Type"] == "text/html" doAssert req.headers["Location"] == "https://nim.org/" ###################################################################### test "UTF-8": let b = " añyóng ♜♞♝♛♚♝♞♜ оживлённым " let m = peg "s": cont <- {128..191} utf8 <- {0..127} | {194..223} * cont[1] | {224..239} * cont[2] | {240..244} * cont[3] s <- *(@ > +(utf8-' ')) let r = m.match(b) doAssert r.ok let c = r.captures doAssert c == 
@["añyóng", "♜♞♝♛♚♝♞♜", "оживлённым"] ###################################################################### test "Back references": let p = peg "doc": S <- *Space doc <- +word * "<<" * R("sep", sep) * S * >heredoc * R("sep") * S * +word word <- +Alpha * S sep <- +Alpha heredoc <- +(1 - R("sep")) let d = """This is a <(Alpha * *( Alpha | Digit | "+" | "-" | "." )): userdata.scheme = $1 authority <- ?(userinfo * "@") * host * ?( ":" * port) userinfo <- >*(unreserved | pct_encoded | sub_delims | ":"): userdata.userinfo = $1 host <- >(IP_literal | IPv4address | reg_name): userdata.host = $1 port <- >*Digit: userdata.port = $1 IP_literal <- "[" * (IPv6address | IPvFuture) * "]" IPvFuture <- "v" * +Xdigit * "." * +(unreserved | sub_delims | ":") IPv6address <- (h16 * ":")[6] * ls32 | "::" * (h16 * ":")[5] * ls32 | ?( h16 ) * "::" * (h16 * ":")[4] * ls32 | ?( h16 * (":" * h16)[0..1] ) * "::" * (h16 * ":")[3] * ls32 | ?( h16 * (":" * h16)[0..2] ) * "::" * (h16 * ":")[2] * ls32 | ?( h16 * (":" * h16)[0..3] ) * "::" * (h16 * ":") * ls32 | ?( h16 * (":" * h16)[0..4] ) * "::" * ls32 | ?( h16 * (":" * h16)[0..5] ) * "::" * h16 | ?( h16 * (":" * h16)[0..6] ) * "::" h16 <- Xdigit[1..4] ls32 <- (h16 * ":" * h16) | IPv4address IPv4address <- dec_octet * "." * dec_octet * "." * dec_octet * "." 
* dec_octet dec_octet <- Digit | # 0-9 {'1'..'9'} * Digit | # 10-99 "1" * Digit * Digit | # 100-199 "2" * {'0'..'4'} * Digit | # 200-249 "25" * {'0'..'5'} # 250-255 reg_name <- *(unreserved | pct_encoded | sub_delims) path <- path_abempty | # begins with "/" or is empty path_absolute | # begins with "/" but not "//" path_noscheme | # begins with a non-colon segment path_rootless | # begins with a segment path_empty # zero characters path_abempty <- >(*( "/" * segment )): userdata.path = $1 path_absolute <- >("/" * ?( segment_nz * *( "/" * segment ) )): userdata.path = $1 path_noscheme <- >(segment_nz_nc * *( "/" * segment )): userdata.path = $1 path_rootless <- >(segment_nz * *( "/" * segment )): userdata.path = $1 path_empty <- 0 segment <- *pchar segment_nz <- +pchar segment_nz_nc <- +( unreserved | pct_encoded | sub_delims | "@" ) # non_zero_length segment without any colon ":" pchar <- unreserved | pct_encoded | sub_delims | ":" | "@" query <- >*( pchar | "|" | "?" ): userdata.query = $1 fragment <- >*( pchar | "|" | "?" ): userdata.fragment = $1 pct_encoded <- "%" * Xdigit * Xdigit unreserved <- Alpha | Digit | "-" | "." | "_" | "~" reserved <- gen_delims | sub_delims gen_delims <- ":" | "|" | "?" | "#" | "[" | "]" | "@" sub_delims <- "!" 
| "$" | "&" | "'" | "(" | ")" | "*" | "+" | "," | ";" | "=" let urls = @[ "s3://somebucket/somefile.txt", "scheme://user:pass@xn--mgbh0fb.xn--kgbechtv", "scheme://user:pass@host:81/path?query#fragment", "ScheMe://user:pass@HoSt:81/path?query#fragment", "scheme://HoSt:81/path?query#fragment", "scheme://@HoSt:81/path?query#fragment", "scheme://user:pass@host/path?query#fragment", "scheme://user:pass@host:/path?query#fragment", "scheme://host/path?query#fragment", "scheme://10.0.0.2/p?q#f", "scheme://[vAF.1::2::3]/p?q#f", "scheme:path?query#fragment", "scheme:///path?query#fragment", "scheme://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]?query#fragment", "scheme:path#fragment", "scheme:path?#fragment", "ldap://[2001:db8::7]/c=GB?objectClass?one", "http://example.org/hello:12?foo=bar#test", "android-app://org.wikipedia/http/en.m.wikipedia.org/wiki/The_Hitchhiker%27s_Guide_to_the_Galaxy", "ftp://:/p?q#f", "scheme://user:pass@host:000000000081/path?query#fragment", "scheme://user:pass@host:81/path?query#fragment", "ScheMe://user:pass@HoSt:81/path?query#fragment", "scheme://HoSt:81/path?query#fragment", "scheme://@HoSt:81/path?query#fragment", "scheme://user:pass@host/path?query#fragment", "scheme://user:pass@host:/path?query#fragment", "scheme://user:pass@host/path?query#fragment", "scheme://host/path?query#fragment", "scheme://10.0.0.2/p?q#f", "scheme:path?query#fragment", "scheme:///path?query#fragment", "scheme://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]?query#fragment", "scheme:path#fragment", "scheme:path?#fragment", "tel:05000", "scheme:path#", "https://thephpleague.com./p?#f", "http://a_.!~*\'(-)n0123Di%25%26:pass;:&=+$,word@www.zend.com", "http://", "http:::/path", "ldap://[2001:db8::7]/c=GB?objectClass?one", "http://example.org/hello:12?foo=bar#test", "android-app://org.wikipedia/http/en.m.wikipedia.org/wiki/The_Hitchhiker%27s_Guide_to_the_Galaxy", "scheme://user:pass@xn--mgbh0fb.xn--kgbechtv", 
"http://download.linuxjournal.com/pdf/get-doc.php?code=2c230d54e20e7cb595c660da48be7622&tcode=epub-301-" ] for s in urls: var uri: Uri let r = p.match(s, uri) if not r.ok: echo s quit 1 ================================================ FILE: tests/lexparse.nim ================================================ import npeg, strutils, sequtils, unittest type Token* = enum tInt tAdd cAddExpr Node = ref object case kind: Token of tInt: intVal: int of tAdd: discard of cAddExpr: l, r: Node State = ref object tokens: seq[Node] stack: seq[Node] # Npeg uses `==` to check if a subject matches a literal proc `==`(n: Node, t: Token): bool = n.kind == t proc `$`(n: Node): string = case n.kind of tInt: return $n.intVal of tAdd: return "+" of cAddExpr: return "(" & $n.l & " + " & $n.r & ")" let lexer = peg(tokens, st: State): s <- *Space tokens <- s * *(token * s) token <- int | add int <- +Digit: st.tokens.add Node(kind: tInt, intVal: parseInt($0)) add <- '+': st.tokens.add Node(kind: tAdd) let parser = peg(g, Node, st: State): g <- int * *add * !1 int <- [tInt]: st.stack.add $0 add <- [tAdd] * int: st.stack.add Node(kind: cAddExpr, r: st.stack.pop, l: st.stack.pop) suite "lexer/parser": test "run": var st = State() doAssert lexer.match("1 + 2 + 3", st).ok doAssert parser.match(st.tokens, st).ok doAssert $st.stack[0] == "((1 + 2) + 3)" ================================================ FILE: tests/lib.nim ================================================ import unittest import strutils import unicode import npeg import npeg/lib/types import npeg/lib/utf8 {.push warning[Spacing]: off.} suite "unit tests": test "types": doAssert patt(types.uint8).match("0").ok doAssert patt(types.uint8).match("255").ok doAssert not patt(types.uint8).match("256").ok doAssert patt(types.int8).match("-128").ok doAssert patt(types.int8).match("127").ok doAssert not patt(types.int8).match("-129").ok doAssert not patt(types.int8).match("128").ok when defined(cpu64): doAssert 
patt(types.uint32).match("4294967295").ok
      doAssert not patt(types.uint32).match("4294967296").ok

  test "utf8 runes":
    # utf8.any[4] must consume exactly four runes, whatever their byte width
    doAssert patt(utf8.any[4] * !1).match("abcd").ok
    # NOTE(review): exact duplicate of the assertion above - possibly meant
    # to cover a different input; confirm against upstream.
    doAssert patt(utf8.any[4] * !1).match("abcd").ok
    doAssert patt(utf8.any[4] * !1).match("всех").ok
    doAssert patt(utf8.any[4] * !1).match("乪乫乬乭").ok

  test "utf8 character classes":
    doAssert patt(utf8.upper).match("Ɵ").ok
    doAssert not patt(utf8.upper).match("ë").ok
    doAssert not patt(utf8.lower).match("Ɵ").ok
    doAssert patt(utf8.lower).match("ë").ok

================================================
FILE: tests/nimversion.nim
================================================
# Parses the version banners of Nim and Nimskull into a NimVersion record
# and prints the result.

import strutils
import npeg

type

  NimType = enum Nim, NimSkull

  # Numeric version triple plus whatever follows an optional '-'.
  Version = object
    maj, min, rev: int
    extra: string

  # Everything the grammar below extracts from a banner.
  NimVersion = object
    typ: NimType
    version: Version
    os: string
    cpu: string
    date: string
    git: string
    boot_switches: seq[string]

let p = peg("nimversion", nv: NimVersion):

  S <- *{' ','\t','\n','\r'}
  nimversion <- oldnim_version | nimskull_version

  oldnim_version <- header * S *
                    "Compiled at " * date * S *
                    "Copyright (c) " * +Graph * " by Andreas Rumpf" * S *
                    "git hash:" * S * git * S *
                    "active boot switches:" * S * boot_switches

  nimskull_version <- header * S *
                      "Source hash: " * git * S *
                      "Source date: " * date

  header <- typ * S * "Compiler Version" * S * version * S * "[" * os * ":" * S * cpu * "]" * S

  # "Nim" is a prefix of "Nimskull", so the longer literal has to be tried
  # first: ordered choice never revisits an alternative once one has matched.
  typ <- typ_nimskull | typ_nim
  typ_nim <- "Nim":
    nv.typ = NimType.Nim
  typ_nimskull <- "Nimskull":
    nv.typ = NimType.NimSkull

  int <- +{'0'..'9'}
  os <- >+Alnum:
    nv.os = $1
  cpu <- >+Alnum:
    nv.cpu = $1
  git <- >+{'0'..'9','a'..'f'}:
    nv.git = $1
  boot_switches <- *(boot_switch * S)
  boot_switch <- >+Graph:
    # appends: a NimVersion that is reused keeps switches of earlier parses
    nv.boot_switches.add($1)
  date <- >+{'0'..'9','-'}:
    nv.date = $1
  # maj.min.rev, optionally followed by "-" and a free-form suffix
  version <- >int * "." * >int * "." * >int * ?"-" * >*Graph:
    nv.version.maj = parseInt($1)
    nv.version.min = parseInt($2)
    nv.version.rev = parseInt($3)
    nv.version.extra = $4

# Sample banners, one per supported compiler flavour.
let vnim = """Nim Compiler Version 2.1.1 [Linux: amd64] Compiled at 2024-03-01 Copyright (c) 2006-2024 by Andreas Rumpf git hash: 1e7ca2dc789eafccdb44304f7e42206c3702fc13 active boot switches: -d:release -d:danger """

let vskull = """Nimskull Compiler Version 0.1.0-dev.21234 [linux: amd64] Source hash: 4948ae809f7d84ef6d765111a7cd0c7cf2ae77d2 Source date: 2024-02-18 """

var nv: NimVersion

block:
  let r = p.match(vnim, nv)
  if r.ok:
    echo nv.repr
  else:
    # a banner that no longer parses must not pass unnoticed
    echo "failed to parse Nim version banner"
    quit 1

block:
  # Start from a pristine record: the grammar's actions mutate `nv` in place
  # (boot_switches is appended to), so without this reset the Nimskull result
  # would still carry the boot switches parsed from the Nim banner above.
  nv = NimVersion()
  let r = p.match(vskull, nv)
  if r.ok:
    echo nv.repr
  else:
    echo "failed to parse Nimskull version banner"
    quit 1

================================================
FILE: tests/performance.nim
================================================
# Micro benchmarks: times several npeg grammars and Nim's own JSON parser on
# the same input and, where a reference time for this host is known, reports
# the result as a percentage of that reference.

import npeg
import os
import streams
import strutils
import tables
import json
import times
#import packedjson
import osproc

# Benchmark input: the bzip2-compressed JSON fixture, decompressed by an
# external `bzip2` process.
let js = execProcess("bzip2 -d < tests/json-32M.bzip2").string

# The host name is only a lookup key into `expectTime`; on systems without
# /etc/hostname fall back to "" (no reference times) instead of aborting the
# whole benchmark with an IOError.
let hostname =
  try:
    readFile("/etc/hostname").strip()
  except IOError:
    ""

# Reference durations in seconds, keyed by host name and then by benchmark
# name.
let expectTime = {
  "platdoos": {
    "json": 0.651,
    "parsejson": 3.962,
    "words": 0.920,
    "search": 0.057,
    "search1": 0.231,
    "search2": 1.419,
    "search3": 0.292,
  }.toTable(),
  "fe2": {
    "json": 3.975,
    "parsejson": 8.739,
    "words": 2.391,
    "search": 0.373,
    "search1": 2.014,
    "search2": 2.871,
    "search3": 0.771,
  }.toTable(),
}.toTable()

# Wake up the governor a bit
# NOTE(review): `v` is not read afterwards, so an optimizing backend may
# remove this loop entirely - confirm it still has the intended effect.

var v = 0
for i in 1..100000:
  for j in 1..1000000:
    inc v

template measureTime*(what: string, code: untyped) =
  ## Runs `code` once in its own block and prints the CPU time it took.
  ##
  ## `what` names the benchmark and is the key used to look up a reference
  ## time for the current host in `expectTime`. When a reference exists the
  ## duration is also printed as a percentage of it; otherwise only the
  ## duration is printed (dividing by the 0.0 placeholder would print
  ## "inf%" or "nan%").
  var expect = 0.0
  if hostname in expectTime:
    if what in expectTime[hostname]:
      expect = expectTime[hostname][what]
  let start = cpuTime()
  block:
    code
  let duration = cpuTime() - start
  if expect > 0.0:
    let perc = 100.0 * duration / expect
    echo what & ": ", duration.formatFloat(ffDecimal, 3), "s ", perc.formatFloat(ffDecimal, 1), "%"
  else:
    echo what & ": ", duration.formatFloat(ffDecimal, 3), "s"

measureTime "json":
  ## Json parsing with npeg
  let p = peg JSON:
    S <- *{' ','\t','\r','\n'}
    True <- "true"
    False <- "false"
    Null <- "null"

    UnicodeEscape <- 'u' * Xdigit[4]
    Escape <- '\\' * ({ '"', '\\', '/', 'b', 'f',
'n', 'r', 't' } | UnicodeEscape)

    StringBody <- *Escape * *( +( {'\x20'..'\xff'} - {'"'} - {'\\'}) * *Escape)
    # the empty actions on String and Number only `discard`
    String <- '"' * StringBody * '"':
      discard

    Minus <- '-'
    IntPart <- '0' | {'1'..'9'} * *{'0'..'9'}
    FractPart <- "." * +{'0'..'9'}
    ExpPart <- ( 'e' | 'E' ) * ?( '+' | '-' ) * +{'0'..'9'}
    Number <- ?Minus * IntPart * ?FractPart * ?ExpPart:
      discard

    DOC <- Value * !1
    ObjPair <- S * String * S * ":" * Value
    Object <- '{' * ( ObjPair * *( "," * ObjPair ) | S ) * "}"
    Array <- "[" * ( Value * *( "," * Value ) | S ) * "]"
    Value <- S * ( Number | String | Object | Array | True | False | Null ) * S

    JSON <- Value * !1

  for i in 1..10:
    doAssert p.match(js).ok

let s = newStringStream(js)
measureTime "parsejson":
  # JSON parsing with Nim's 'parsejson' module: pull events until the parser
  # reports an error or the end of the input.
  for i in 1..10:
    s.setPosition(0)
    var p: JsonParser
    open(p, s, "json")
    while true:
      p.next()
      if p.kind == jsonError or p.kind == jsonEof:
        break

measureTime "words":
  # Counts runs of alphabetic characters; this one does a single pass over
  # the input, unlike the benchmarks that loop ten times.
  var v = 0
  let p = peg foo:
    foo <- +word
    word <- @>+Alpha:
      inc v
  discard p.match(js).ok

measureTime "search":
  # Search using built in search operator
  var v = 0
  let p = peg search:
    search <- @"CALIFORNIA":
      inc v
  for i in 1..10:
    discard p.match(js).ok

measureTime "search1":
  # Searches using tail recursion.
  let p = peg SS:
    SS <- +S
    S <- "CALIFORNIA" | 1 * S
  for i in 1..10:
    discard p.match(js).ok

measureTime "search2":
  # Searches using an explicit loop: skip one character at a time for as
  # long as the needle does not start at the current position.
  let p = peg SS:
    SS <- +S
    S <- *( !"CALIFORNIA" * 1) * "CALIFORNIA"
  for i in 1..10:
    discard p.match(js).ok

measureTime "search3":
  # Like search1, but using an optimization to skip false starts: after a
  # failed attempt, skip everything that is not a 'C' before recursing.
  let p = peg SS:
    SS <- +S
    S <- "CALIFORNIA" | 1 * *(1-'C') * S
  for i in 1..10:
    discard p.match(js).ok

================================================
FILE: tests/precedence.nim
================================================
import unittest
import strutils
import math
import tables
import npeg

{.push warning[Spacing]: off.}


suite "precedence operator":

  # The PEG below implements a Pratt parser. The ^ and ^^ operators are used to
  # implement precedence climbing, this allows rules to be left recursive while
  # still avoiding unbounded recursion.
  #
  # The parser local state `seq[int]` is used as a stack to store captures and
  # intermediate results while parsing, the end result of the expression will
  # be available in element 0 when the parser finishes

  test "expr evaluator":

    # Table of binary operators - this maps the operator string to a proc
    # performing the operation. Division is mapped to `/%`, the same operator
    # the expected values in the test cases below are written with.

    template map(op: untyped): untyped = (proc(a, b: int): int = op(a, b))

    var binOps = {
      "+": map(`+`),
      "-": map(`-`),
      "*": map(`*`),
      "/": map(`/%`),
      "^": map(`^`),
    }.toTable()

    let p = peg(exp, st: seq[int]):

      S <- *Space

      # Capture a number and put it on the stack

      number <- >+Digit * S:
        st.add parseInt($1)

      # Reset the precedence level to 0 when parsing sub-expressions
      # in parentheses

      parenExp <- ( "(" * exp * ")" ) ^ 0

      # Unary minus: take last element of the stack, negate and push back

      uniMinus <- '-' * exp:
        st.add(-st.pop)

      # The prefix is a number, a sub expression in parentheses or the unary
      # `-` operator.

      prefix <- number | parenExp | uniMinus

      # Parse an infix operator. Bounded by the precedence operator that makes
      # sure `exp` is only parsed if the current precedence is lower than the
      # given precedence. Note that the power operator has right associativity.

      infix <- >{'+','-'} * exp ^ 1 |
               >{'*','/'} * exp ^ 2 |
               >{'^'} * exp ^^ 3 :

        # Takes two results off the stack, applies the operator and push
        # back the result. The first pop yields the right-hand operand (f2),
        # the second one the left-hand operand (f1).

        let (f2, f1) = (st.pop, st.pop)
        st.add binOps[$1](f1, f2)

      # An expression consists of a prefix followed by zero or more infix
      # operators

      exp <- S * prefix * *infix

    # Evaluate the given expression: parse it with a fresh stack and return
    # the value left in element 0.

    proc eval(expr: string): int =
      var st: seq[int]
      doAssert p.match(expr, st).ok
      st[0]

    # Test cases: every expected value is the same expression written in Nim

    doAssert eval("2+1") == 2+1
    doAssert eval("(((2+(1))))") == 2+1
    doAssert eval("3+2") == 3+2

    doAssert eval("3+2+4") == 3+2+4
    doAssert eval("(3+2)+4") == 3+2+4
    doAssert eval("3+(2+4)") == 3+2+4
    doAssert eval("(3+2+4)") == 3+2+4

    doAssert eval("3*2*4") == 3*2*4
    doAssert eval("(3*2)*4") == 3*2*4
    doAssert eval("3*(2*4)") == 3*2*4
    doAssert eval("(3*2*4)") == 3*2*4

    doAssert eval("3-2-4") == 3-2-4
    doAssert eval("(3-2)-4") == (3-2)-4
    doAssert eval("3-(2-4)") == 3-(2-4)
    doAssert eval("(3-2-4)") == 3-2-4

    doAssert eval("3/8/4") == 3/%8/%4
    doAssert eval("(3/8)/4") == (3/%8)/%4
    doAssert eval("3/(8/4)") == 3/%(8/%4)
    doAssert eval("(3/8/4)") == 3/%8/%4

    doAssert eval("(3*8/4)") == 3*8/%4
    doAssert eval("(3/8*4)") == 3/%8*4
    doAssert eval("3*(8/4)") == 3*(8/%4)

    doAssert eval("4^3^2") == 4^3^2
    doAssert eval("(4^3)^2") == (4^3)^2
    doAssert eval("4^(3^2)") == 4^(3^2)

================================================
FILE: tests/testdata
================================================
one=1,two=2,three=3,four=4

================================================
FILE: tests/tests.nim
================================================
# Aggregate test file: textually includes the individual test files listed
# below.
include "basics.nim"
include "examples.nim"
include "captures.nim"
include "precedence.nim"
include "lib.nim"
include "lexparse.nim"