Repository: shixzie/nlp Branch: master Commit: 39fec05b9991 Files: 12 Total size: 57.2 KB Directory structure: gitextract_2oze1ahc/ ├── .gitignore ├── .travis.yml ├── Gopkg.toml ├── LICENSE ├── Makefile ├── README.md ├── benchmark_test.go ├── nlp.go ├── nlp_test.go └── parser/ ├── nlp.peg ├── parser.go └── parser_test.go ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # dependencies vendor ================================================ FILE: .travis.yml ================================================ language: go go: - 1.8.x - 1.9.x - tip before_install: - go get -u github.com/golang/dep/cmd/dep - dep ensure script: - go test -v -race -coverprofile=coverage.txt -covermode=atomic after_success: - bash <(curl -s https://codecov.io/bash) ================================================ FILE: Gopkg.toml ================================================ required = [ "github.com/mna/pigeon" ] ================================================ FILE: LICENSE ================================================ The MIT License (MIT) Copyright (c) 2017 Juan Álvarez / @Shixzie Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: Makefile ================================================ help: @echo "deps -> Get all dependencies" @echo "parser -> Generates the sample parser" @echo "tests -> Run all tests" deps: @go get -u github.com/golang/dep/cmd/dep @dep ensure parser: @pigeon -o "./parser/parser.go" "./parser/nlp.peg" tests: @go test -v -race ./... ================================================ FILE: README.md ================================================ [![GoDoc](https://godoc.org/github.com/shixzie/nlp?status.svg)](https://godoc.org/github.com/shixzie/nlp) [![Go Report Card](https://goreportcard.com/badge/github.com/shixzie/nlp)](https://goreportcard.com/report/github.com/shixzie/nlp) [![Build Status](https://travis-ci.org/shixzie/nlp.svg?branch=master)](https://travis-ci.org/shixzie/nlp) [![codecov](https://codecov.io/gh/shixzie/nlp/branch/master/graph/badge.svg)](https://codecov.io/gh/shixzie/nlp) # nlp > `nlp` is a general purpose any-lang Natural Language Processor that parses the data inside a text and returns a filled model ## Supported types ```go int int8 int16 int32 int64 uint uint8 uint16 uint32 uint64 float32 float64 string time.Time time.Duration ``` ## Installation ``` // go1.8+ is required go get -u github.com/shixzie/nlp ``` **Feel free to create PR's and open Issues :)** ## How it works You will always begin by creating a NL type calling nlp.New(), the NL type is a Natural Language Processor that owns 3 funcs, RegisterModel(), Learn() and P(). 
### RegisterModel(i interface{}, samples []string, ops ...ModelOption) error RegisterModel takes 3 parameters, an empty struct, a set of samples and some options for the model. The empty struct lets nlp know all possible values inside the text, for example: ```go type Song struct { Name string // fields must be exported Artist string ReleasedAt time.Time } err := nl.RegisterModel(Song{}, someSamples, nlp.WithTimeFormat("2006")) if err != nil { panic(err) } // ... ``` tells nlp that inside the text may be a Song.Name, a Song.Artist and a Song.ReleasedAt. The samples are the key part about nlp, not just because they set the *limits* between *keywords* but also because they will be used to choose which model to use to handle an expression. Samples must have a special syntax to set those *limits* and *keywords*. ```go songSamples := []string{ "play {Name} by {Artist}", "play {Name} from {Artist}", "play {Name}", "from {Artist} play {Name}", "play something from {ReleasedAt}", } ``` In the example above, you can see we're referring to the Name and Artist fields of the `Song` type declared above, both `{Name}` and `{Artist}` are our *keywords* and yes! you guessed it! Everything between `play` and `by` will be treated as a `{Name}`, and everything that's after `by` will be treated as an `{Artist}` meaning that `play` and `by` are our *limits*. ``` limits ┌─────┴─────┐ ┌┴─┐ ┌┴┐ play {Name} by {Artist} └─┬──┘ └───┬──┘ └──────┬─────┘ keywords ``` Any character can be a *limit*, a `,` for example can be used as a limit. *keywords* as well as *limits* are `CaseSensitive` so be sure to type them right. **Note that putting 2 *keywords* together will cause only 1 or none of them to be detected** > *limits are important* - Me :3 ### Learn() error Learn maps all models' samples to their respective models using the NaiveBayes algorithm based on those samples. `Learn()` also trains all registered models so they're able to fit expressions in the future.
```go // must call after all models are registered and before calling nl.P() err := nl.Learn() if err != nil { panic(err) } // ... ``` Once the algorithm has finished learning, we're now ready to start Processing those texts. **Note that you must call NL.Learn() after all models are registered and before calling NL.P()** ### P(expr string) interface{} P first asks the trained algorithm which model should be used, once we get the right *and already trained* model, we just make it fit the expression. **Note that everything in the expression must be separated by a _space_ or _tab_** When processing an expression, nlp searches for the *limits* inside that expression and evaluates which sample fits the expression better, it doesn't matter if the text has `trash`. In this example: ``` limits ┌─────┴─────┐ ┌┴─┐ ┌┴┐ play {Name} by {Artist} └─┬──┘ └───┬──┘ └──────┬─────┘ keywords ``` we have 2 *limits*, `play` and `by`, it doesn't matter if we had an expression *hello sir can you pleeeeeease play King by Lauren Aquilina*, since: ``` limits trash ┌────┴────┐ ┌─────────────┴─────────────┐ ┌┴─┐ ┌┴┐ hello sir can you pleeeeeease play King by Lauren Aquilina └┬─┘ └─────┬───────┘ {Name} {Artist} └─┬──┘ └───┬──┘ └──────┬───────┘ keywords ``` `{Name}` would be replaced with `King`, `{Artist}` would be replaced with `Lauren Aquilina`, `trash` would be ignored as well as the *limits* `play` and `by`, and then **a pointer to a filled struct with the type used to register the model** (`Song`) ( `Song.Name` being `{Name}` and `Song.Artist` being `{Artist}` ) **will be returned**.
## Usage ```go type Song struct { Name string Artist string ReleasedAt time.Time } songSamples := []string{ "play {Name} by {Artist}", "play {Name} from {Artist}", "play {Name}", "from {Artist} play {Name}", "play something from {ReleasedAt}", } nl := nlp.New() err := nl.RegisterModel(Song{}, songSamples, nlp.WithTimeFormat("2006")) if err != nil { panic(err) } err = nl.Learn() // you must call Learn after all models are registered and before calling P if err != nil { panic(err) } // after learning you can call P the times you want s := nl.P("hello sir can you pleeeeeease play King by Lauren Aquilina") if song, ok := s.(*Song); ok { fmt.Println("Success") fmt.Printf("%#v\n", song) } else { fmt.Println("Failed") } // Prints // // Success // &main.Song{Name: "King", Artist: "Lauren Aquilina"} ``` ================================================ FILE: benchmark_test.go ================================================ package nlp import ( "testing" "time" ) func BenchmarkNL_P(b *testing.B) { type T struct { String string Int int Uint uint Float float32 Time time.Time Dur time.Duration } tSamples := []string{ "string {String}", "int {Int}", "uint {Uint}", "float {Float}", "time {Time}", "dur {Dur}", "string {String} int {Int}", "string {String} time {Time}", } nl := New() nl.RegisterModel(T{}, tSamples) err := nl.RegisterModel(T{}, tSamples) if err != nil { b.Error(err) } err = nl.Learn() if err != nil { b.Error(err) } tim, err := time.ParseInLocation("01-02-2006_3:04pm", "05-18-1999_6:42pm", time.Local) if err != nil { b.Error(err) } dur, err := time.ParseDuration("4h2m") if err != nil { b.Error(err) } cases := []struct { name string expression string want interface{} }{ { "string", "string Hello World", "Hello World", }, { "int", "int 42", int(42), }, { "uint", "uint 43", uint(43), }, { "float", "float 44", float32(44), }, { "time", "time 05-18-1999_6:42pm", tim, }, { "duration", "dur 4h2m", dur, }, { "string int", "string Lmao int 42", &T{ String: "Lmao", Int: 42, }, 
}, { "string time", "string What's Up Boy time 05-18-1999_6:42pm", &T{ String: "What's Up Boy", Time: tim, }, }, } for _, c := range cases { b.Run(c.name, func(b *testing.B) { nl.P(c.expression) }) } } ================================================ FILE: nlp.go ================================================ // Package nlp provides general purpose Natural Language Processing. package nlp import ( "bytes" "errors" "fmt" "reflect" "strconv" "time" "unicode" "github.com/cdipaolo/goml/base" "github.com/cdipaolo/goml/text" "github.com/shixzie/nlp/parser" ) // NL is a Natural Language Processor type NL struct { models []*model naive *text.NaiveBayes // Output contains the training output for the // NaiveBayes algorithm Output *bytes.Buffer } // New returns a *NL func New() *NL { return &NL{Output: bytes.NewBufferString("")} } // P proccesses the expr and returns one of // the types passed as the i parameter to the RegistryModel // func filled with the data inside expr func (nl *NL) P(expr string) interface{} { return nl.models[nl.naive.Predict(expr)].fit(expr) } // Learn maps the models samples to the models themselves and // returns an error if something occurred while learning func (nl *NL) Learn() error { if len(nl.models) > 0 { stream := make(chan base.TextDatapoint) errors := make(chan error) nl.naive = text.NewNaiveBayes(stream, uint8(len(nl.models)), base.OnlyWordsAndNumbers) nl.naive.Output = nl.Output go nl.naive.OnlineLearn(errors) for i := range nl.models { err := nl.models[i].learn() if err != nil { return fmt.Errorf("model#%d %v", i, err) } for _, s := range nl.models[i].samples { stream <- base.TextDatapoint{ X: string(s), Y: uint8(i), } } } close(stream) for { err := <-errors if err != nil { return fmt.Errorf("error occurred while learning: %s", err) } // training is done! 
break } return nil } return fmt.Errorf("register at least one model before learning") } type model struct { tpy reflect.Type fields []field expected [][]item samples [][]byte timeFormat string timeLocation *time.Location } type item struct { limit bool value []byte field field } type field struct { index int name string kind interface{} } // ModelOption is an option for a specific model type ModelOption func(*model) error // WithTimeFormat sets the format used in time.Parse(format, val), // note that format can't contain any spaces, the default is 01-02-2006_3:04pm func WithTimeFormat(format string) ModelOption { return func(m *model) error { for _, v := range format { if unicode.IsSpace(v) { return errors.New("time format can't contain any spaces") } } m.timeFormat = format return nil } } // WithTimeLocation sets the location used in time.ParseInLocation(format, value, loc), // the default is time.Local func WithTimeLocation(loc *time.Location) ModelOption { return func(m *model) error { if loc == nil { return errors.New("time location can't be nil") } m.timeLocation = loc return nil } } // RegisterModel registers a model i and creates possible patterns // from samples, the default layout when parsing time is 01-02-2006_3:04pm // and the default location is time.Local. 
// Samples must have special formatting: // // "play {Name} by {Artist}" func (nl *NL) RegisterModel(i interface{}, samples []string, ops ...ModelOption) error { if i == nil { return fmt.Errorf("can't create model from nil value") } if len(samples) == 0 { return fmt.Errorf("samples can't be nil or empty") } tpy, val := reflect.TypeOf(i), reflect.ValueOf(i) if tpy.Kind() == reflect.Struct { mod := &model{ tpy: tpy, expected: make([][]item, len(samples)), timeFormat: "01-02-2006_3:04pm", timeLocation: time.Local, } mod.setSamples(samples) for _, op := range ops { err := op(mod) if err != nil { return err } } NextField: for i := 0; i < tpy.NumField(); i++ { if tpy.Field(i).Anonymous || tpy.Field(i).PkgPath != "" { continue NextField } if v, ok := val.Field(i).Interface().(time.Time); ok { mod.fields = append(mod.fields, field{i, tpy.Field(i).Name, v}) continue NextField } else if v, ok := val.Field(i).Interface().(time.Duration); ok { mod.fields = append(mod.fields, field{i, tpy.Field(i).Name, v}) continue NextField } switch val.Field(i).Kind() { case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Float32, reflect.Float64, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.String: mod.fields = append(mod.fields, field{i, tpy.Field(i).Name, val.Field(i).Kind()}) } } nl.models = append(nl.models, mod) return nil } return fmt.Errorf("can't create model from non-struct type") } func (m *model) learn() error { for sid, s := range m.samples { tokens, err := parser.ParseSample(sid, s) if err != nil { return err } var exps []item var hasAtLeastOneKey bool l := len(tokens) for i, tk := range tokens { if tk.Kw { hasAtLeastOneKey = true mistypedField := true for _, f := range m.fields { if string(tk.Val) == f.name { mistypedField = false exps = append(exps, item{field: f, value: tk.Val}) } } if mistypedField { return fmt.Errorf("sample#%d: mistyped field %q", sid, tk.Val) } } else { if i+1 < l { if tokens[i+1].Kw 
{ exps = append(exps, item{limit: true, value: tk.Val}) continue } } } } if !hasAtLeastOneKey { return fmt.Errorf("sample#%d: need at least one keyword", sid) } m.expected[sid] = exps } return nil } func (m *model) selectBestSample(expr []byte) []item { // slice [sample_id]score scores := make([]int, len(m.samples)) tokens, _ := parser.ParseSample(0, expr) mapping := make([][]item, len(m.samples)) limitsOrder := make([][][]byte, len(m.samples)+1) for sid, exps := range m.expected { var currentVal [][]byte var reading bool var lastToken int expecteds: for _, e := range exps { // fmt.Printf("expecting: %s - limit: %v\n", e.value, e.limit) if e.limit { reading = false limitsOrder[sid+1] = append(limitsOrder[sid+1], e.value) } else { reading = true } // fmt.Printf("reading: %v\n", reading) for i := lastToken; i < len(tokens); i++ { t := tokens[i] // fmt.Printf("token: %s - isLimit: %v\n", t.Val, m.isLimit(t.Val, sid)) if m.isLimit(t.Val, sid) { if sid == 0 { limitsOrder[0] = append(limitsOrder[0], t.Val) } scores[sid]++ if len(currentVal) > 0 { // fmt.Printf("appending: %s {%v}\n", bytes.Join(currentVal, []byte{' '}), e.field.name) mapping[sid] = append(mapping[sid], item{field: e.field, value: bytes.Join(currentVal, []byte{' '})}) currentVal = currentVal[:0] lastToken = i continue expecteds } lastToken = i + 1 continue expecteds } else { if reading { // fmt.Printf("adding: %s\n", t.Val) currentVal = append(currentVal, t.Val) } } } if len(currentVal) > 0 { // fmt.Printf("appending: %s {%v}\n", bytes.Join(currentVal, []byte{' '}), e.field.name) mapping[sid] = append(mapping[sid], item{field: e.field, value: bytes.Join(currentVal, []byte{' '})}) } } // fmt.Printf("\n\n") } order: for i := 1; i < len(limitsOrder); i++ { if len(limitsOrder[0]) < len(limitsOrder[i]) { continue order } for j := range limitsOrder[i] { if !bytes.Equal(limitsOrder[i][j], limitsOrder[0][j]) { continue order } } scores[i-1]++ } // fmt.Printf("orders: %s\n\n", limitsOrder) // fmt.Printf("scores: 
%v\n", scores) bestMapping := selectBestMapping(scores) if bestMapping == -1 { return nil } return mapping[bestMapping] } func selectBestMapping(scores []int) int { bestScore, bestMapping := -1, -1 for id, score := range scores { if score > bestScore { bestScore = score bestMapping = id } } return bestMapping } func (m *model) fit(expr string) interface{} { val := reflect.New(m.tpy) if len(expr) == 0 { return val.Interface() } exps := m.selectBestSample([]byte(expr)) if len(exps) > 0 { for _, e := range exps { switch t := e.field.kind.(type) { case reflect.Kind: switch t { case reflect.String: val.Elem().Field(e.field.index).SetString(string(e.value)) case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64: v, _ := strconv.ParseUint(string(e.value), 10, 0) val.Elem().Field(e.field.index).SetUint(v) case reflect.Float32, reflect.Float64: v, _ := strconv.ParseFloat(string(e.value), 64) val.Elem().Field(e.field.index).SetFloat(v) case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64: v, _ := strconv.ParseInt(string(e.value), 10, 0) val.Elem().Field(e.field.index).SetInt(v) } case time.Time: v, _ := time.ParseInLocation(m.timeFormat, string(e.value), m.timeLocation) val.Elem().Field(e.field.index).Set(reflect.ValueOf(v)) case time.Duration: v, _ := time.ParseDuration(string(e.value)) val.Elem().Field(e.field.index).Set(reflect.ValueOf(v)) } } } return val.Interface() } // isLimit returns true if s is a limit on expected[id] func (m *model) isLimit(s []byte, id int) bool { for _, e := range m.expected[id] { if bytes.Equal(e.value, s) { return true } } return false } // setSample converts the []string samples to [][]byte func (m *model) setSamples(samples []string) { for _, s := range samples { m.samples = append(m.samples, []byte(s)) } } ================================================ FILE: nlp_test.go ================================================ package nlp import ( "bytes" "reflect" "testing" "time" 
// failTest marks t as failed with err when err is non-nil; a nil err is a
// no-op. The test keeps running after the failure has been recorded.
func failTest(t *testing.T, err error) {
	if err == nil {
		return
	}
	t.Error(err)
}
"non-struct", fields{}, args{[]int{}, []string{""}, nil}, true, }, { "unexported & time.Time", fields{}, args{T{}, []string{""}, nil}, false, }, { "options", fields{}, args{T{}, []string{""}, []ModelOption{ WithTimeFormat("02-01-2006"), WithTimeLocation(time.Local), }}, false, }, } for i, tt := range tests { t.Run(tt.name, func(t *testing.T) { nl := &NL{ models: tt.fields.models, naive: tt.fields.naive, Output: tt.fields.Output, } if err := nl.RegisterModel(tt.args.i, tt.args.samples, tt.args.ops...); (err != nil) != tt.wantErr { t.Errorf("[%d] NL.RegisterModel() error = %v, wantErr %v", i, err, tt.wantErr) } }) } } func TestNL_Learn(t *testing.T) { type fields struct { models []*model naive *text.NaiveBayes Output *bytes.Buffer } type T struct { Name string } tests := []struct { name string fields fields wantErr bool }{ { "no models", fields{}, true, }, { "empty model sample", fields{ models: []*model{ { samples: [][]byte{{}}, }, }, Output: bytes.NewBufferString(""), }, true, }, { "mistyped field", fields{ models: []*model{ { samples: [][]byte{[]byte("Hello {Namee}")}, }, }, Output: bytes.NewBufferString(""), }, true, }, { "sample with no keys", fields{ models: []*model{ { samples: [][]byte{[]byte("Hello")}, }, }, Output: bytes.NewBufferString(""), }, true, }, } for i, tt := range tests { t.Run(tt.name, func(t *testing.T) { nl := &NL{ models: tt.fields.models, naive: tt.fields.naive, Output: tt.fields.Output, } if err := nl.Learn(); (err != nil) != tt.wantErr { t.Errorf("[%d] NL.Learn() error = %v, wantErr %v", i, err, tt.wantErr) } }) } } func TestWithTimeFormat(t *testing.T) { type args struct { format string m *model } tests := []struct { name string args args wantErr bool }{ { "invalid format", args{"2006 01 02", &model{}}, true, }, { "valid format", args{"2006", &model{}}, false, }, } for i, tt := range tests { t.Run(tt.name, func(t *testing.T) { op := WithTimeFormat(tt.args.format) if err := op(tt.args.m); (err != nil) != tt.wantErr { t.Errorf("[%d] 
WithTimeFormat() error = %v, wantErr %v", i, err, tt.wantErr) } }) } } func TestWithTimeLocation(t *testing.T) { type args struct { loc *time.Location m *model } tests := []struct { name string args args wantErr bool }{ { "invalid location", args{nil, &model{}}, true, }, { "valid format", args{time.Local, &model{}}, false, }, } for i, tt := range tests { t.Run(tt.name, func(t *testing.T) { op := WithTimeLocation(tt.args.loc) if err := op(tt.args.m); (err != nil) != tt.wantErr { t.Errorf("[%d] WithTimeFormat() error = %v, wantErr %v", i, err, tt.wantErr) } }) } } ================================================ FILE: parser/nlp.peg ================================================ { // Package parser contains the sample parser for nlp package parser import "fmt" import "errors" // Token is a sample token type Token struct { Kw bool Val []byte } // ParseSample will return the tokens within the sample func ParseSample(sampleID int, sample []byte) ([]Token, error) { samplename := fmt.Sprintf("sample#%d", sampleID) tokens, err := Parse(samplename, sample) var errs errList if err != nil { list := err.(errList) for _, err := range list { pe := err.(*parserError) errs.add(fmt.Errorf("%s: %v", samplename, pe.Inner)) } return nil, errs } return tokens.([]Token), nil } } Sample "sample" = vs:(Identifier / Keyword / Spacing)* { if len(vs.([]interface{})) == 0 { return nil, errors.New("empty sample") } var tokens []Token for _, v := range vs.([]interface{}) { switch tk := v.(type) { case Token: tokens = append(tokens, tk) default: } } return tokens, nil } Keyword "keyword" = '{' Spacing+ v:Identifier '}' { return Token{Kw: true, Val: v.(Token).Val}, nil } / '{' v:Identifier Spacing+ '}' { return Token{Kw: true, Val: v.(Token).Val}, nil } / '{' Spacing+ v:Identifier Spacing+ '}' { return Token{Kw: true, Val: v.(Token).Val}, nil } / '{' v:Identifier '}' { return Token{Kw: true, Val: v.(Token).Val}, nil } Punct "punct" = [^a-zA-Z0-9{} ]+ { return Token{Val: c.text}, nil } 
Identifier "identifier" = Punct / [^{} \t\r\n]+ { return Token{Val: c.text}, nil } Spacing "spacing" = Space+ / _+ Space "Space" = ' ' _ "whitespace" = [\t\r\n] ================================================ FILE: parser/parser.go ================================================ // Package parser contains the sample parser for nlp package parser import ( "bytes" "errors" "fmt" "io" "io/ioutil" "math" "os" "sort" "strings" "unicode" "unicode/utf8" ) // Token is a sample token type Token struct { Kw bool Val []byte } // ParseSample will return the tokens within the sample func ParseSample(sampleID int, sample []byte) ([]Token, error) { samplename := fmt.Sprintf("sample#%d", sampleID) tokens, err := Parse(samplename, sample) var errs errList if err != nil { list := err.(errList) for _, err := range list { pe := err.(*parserError) errs.add(fmt.Errorf("%s: %v", samplename, pe.Inner)) } return nil, errs } return tokens.([]Token), nil } var g = &grammar{ rules: []*rule{ { name: "Sample", displayName: "\"sample\"", pos: position{line: 32, col: 1, offset: 685}, expr: &actionExpr{ pos: position{line: 33, col: 3, offset: 703}, run: (*parser).callonSample1, expr: &labeledExpr{ pos: position{line: 33, col: 3, offset: 703}, label: "vs", expr: &zeroOrMoreExpr{ pos: position{line: 33, col: 6, offset: 706}, expr: &choiceExpr{ pos: position{line: 33, col: 7, offset: 707}, alternatives: []interface{}{ &ruleRefExpr{ pos: position{line: 33, col: 7, offset: 707}, name: "Identifier", }, &ruleRefExpr{ pos: position{line: 33, col: 20, offset: 720}, name: "Keyword", }, &ruleRefExpr{ pos: position{line: 33, col: 30, offset: 730}, name: "Spacing", }, }, }, }, }, }, }, { name: "Keyword", displayName: "\"keyword\"", pos: position{line: 48, col: 1, offset: 1050}, expr: &choiceExpr{ pos: position{line: 49, col: 3, offset: 1070}, alternatives: []interface{}{ &actionExpr{ pos: position{line: 49, col: 3, offset: 1070}, run: (*parser).callonKeyword2, expr: &seqExpr{ pos: position{line: 49, col: 3, 
offset: 1070}, exprs: []interface{}{ &litMatcher{ pos: position{line: 49, col: 3, offset: 1070}, val: "{", ignoreCase: false, }, &oneOrMoreExpr{ pos: position{line: 49, col: 7, offset: 1074}, expr: &ruleRefExpr{ pos: position{line: 49, col: 7, offset: 1074}, name: "Spacing", }, }, &labeledExpr{ pos: position{line: 49, col: 16, offset: 1083}, label: "v", expr: &ruleRefExpr{ pos: position{line: 49, col: 18, offset: 1085}, name: "Identifier", }, }, &litMatcher{ pos: position{line: 49, col: 29, offset: 1096}, val: "}", ignoreCase: false, }, }, }, }, &actionExpr{ pos: position{line: 52, col: 3, offset: 1158}, run: (*parser).callonKeyword10, expr: &seqExpr{ pos: position{line: 52, col: 3, offset: 1158}, exprs: []interface{}{ &litMatcher{ pos: position{line: 52, col: 3, offset: 1158}, val: "{", ignoreCase: false, }, &labeledExpr{ pos: position{line: 52, col: 7, offset: 1162}, label: "v", expr: &ruleRefExpr{ pos: position{line: 52, col: 9, offset: 1164}, name: "Identifier", }, }, &oneOrMoreExpr{ pos: position{line: 52, col: 20, offset: 1175}, expr: &ruleRefExpr{ pos: position{line: 52, col: 20, offset: 1175}, name: "Spacing", }, }, &litMatcher{ pos: position{line: 52, col: 29, offset: 1184}, val: "}", ignoreCase: false, }, }, }, }, &actionExpr{ pos: position{line: 55, col: 3, offset: 1246}, run: (*parser).callonKeyword18, expr: &seqExpr{ pos: position{line: 55, col: 3, offset: 1246}, exprs: []interface{}{ &litMatcher{ pos: position{line: 55, col: 3, offset: 1246}, val: "{", ignoreCase: false, }, &oneOrMoreExpr{ pos: position{line: 55, col: 7, offset: 1250}, expr: &ruleRefExpr{ pos: position{line: 55, col: 7, offset: 1250}, name: "Spacing", }, }, &labeledExpr{ pos: position{line: 55, col: 16, offset: 1259}, label: "v", expr: &ruleRefExpr{ pos: position{line: 55, col: 18, offset: 1261}, name: "Identifier", }, }, &oneOrMoreExpr{ pos: position{line: 55, col: 29, offset: 1272}, expr: &ruleRefExpr{ pos: position{line: 55, col: 29, offset: 1272}, name: "Spacing", }, }, 
&litMatcher{ pos: position{line: 55, col: 38, offset: 1281}, val: "}", ignoreCase: false, }, }, }, }, &actionExpr{ pos: position{line: 58, col: 3, offset: 1343}, run: (*parser).callonKeyword28, expr: &seqExpr{ pos: position{line: 58, col: 3, offset: 1343}, exprs: []interface{}{ &litMatcher{ pos: position{line: 58, col: 3, offset: 1343}, val: "{", ignoreCase: false, }, &labeledExpr{ pos: position{line: 58, col: 7, offset: 1347}, label: "v", expr: &ruleRefExpr{ pos: position{line: 58, col: 9, offset: 1349}, name: "Identifier", }, }, &litMatcher{ pos: position{line: 58, col: 20, offset: 1360}, val: "}", ignoreCase: false, }, }, }, }, }, }, }, { name: "Punct", displayName: "\"punct\"", pos: position{line: 63, col: 1, offset: 1422}, expr: &actionExpr{ pos: position{line: 64, col: 3, offset: 1438}, run: (*parser).callonPunct1, expr: &oneOrMoreExpr{ pos: position{line: 64, col: 3, offset: 1438}, expr: &charClassMatcher{ pos: position{line: 64, col: 3, offset: 1438}, val: "[^a-zA-Z0-9{} ]", chars: []rune{'{', '}', ' '}, ranges: []rune{'a', 'z', 'A', 'Z', '0', '9'}, ignoreCase: false, inverted: true, }, }, }, }, { name: "Identifier", displayName: "\"identifier\"", pos: position{line: 69, col: 1, offset: 1496}, expr: &choiceExpr{ pos: position{line: 70, col: 3, offset: 1522}, alternatives: []interface{}{ &ruleRefExpr{ pos: position{line: 70, col: 3, offset: 1522}, name: "Punct", }, &actionExpr{ pos: position{line: 70, col: 11, offset: 1530}, run: (*parser).callonIdentifier3, expr: &oneOrMoreExpr{ pos: position{line: 70, col: 11, offset: 1530}, expr: &charClassMatcher{ pos: position{line: 70, col: 11, offset: 1530}, val: "[^{} \\t\\r\\n]", chars: []rune{'{', '}', ' ', '\t', '\r', '\n'}, ignoreCase: false, inverted: true, }, }, }, }, }, }, { name: "Spacing", displayName: "\"spacing\"", pos: position{line: 74, col: 1, offset: 1584}, expr: &choiceExpr{ pos: position{line: 75, col: 3, offset: 1604}, alternatives: []interface{}{ &oneOrMoreExpr{ pos: position{line: 75, col: 3, 
offset: 1604},
						expr: &ruleRefExpr{
							pos:  position{line: 75, col: 3, offset: 1604},
							name: "Space",
						},
					},
					&oneOrMoreExpr{
						pos: position{line: 75, col: 12, offset: 1613},
						expr: &ruleRefExpr{
							pos:  position{line: 75, col: 12, offset: 1613},
							name: "_",
						},
					},
				},
			},
		},
		{
			name:        "Space",
			displayName: "\"Space\"",
			pos:         position{line: 77, col: 1, offset: 1617},
			expr: &litMatcher{
				pos:        position{line: 78, col: 3, offset: 1633},
				val:        " ",
				ignoreCase: false,
			},
		},
		{
			name:        "_",
			displayName: "\"whitespace\"",
			pos:         position{line: 80, col: 1, offset: 1638},
			expr: &charClassMatcher{
				pos:        position{line: 81, col: 3, offset: 1655},
				val:        "[\\t\\r\\n]",
				chars:      []rune{'\t', '\r', '\n'},
				ignoreCase: false,
				inverted:   false,
			},
		},
	},
}

// onSample1 is the action code invoked by callonSample1. vs must be a
// []interface{}; every element that is a Token is kept, in order, and all
// other elements are dropped. An empty vs yields an "empty sample" error.
func (c *current) onSample1(vs interface{}) (interface{}, error) {
	items := vs.([]interface{})
	if len(items) == 0 {
		return nil, errors.New("empty sample")
	}

	var tokens []Token
	for _, item := range items {
		if tk, ok := item.(Token); ok {
			tokens = append(tokens, tk)
		}
	}
	return tokens, nil
}

// callonSample1 passes the value labeled "vs" in the top variable set of the
// vstack to onSample1.
func (p *parser) callonSample1() (interface{}, error) {
	stack := p.vstack[len(p.vstack)-1]
	return p.cur.onSample1(stack["vs"])
}

// onKeyword2 turns the Token bound to v into a keyword token: Kw is set and
// Val is carried over. v must hold a Token.
func (c *current) onKeyword2(v interface{}) (interface{}, error) {
	return Token{Kw: true, Val: v.(Token).Val}, nil
}

// callonKeyword2 passes the value labeled "v" in the top variable set of the
// vstack to onKeyword2.
func (p *parser) callonKeyword2() (interface{}, error) {
	stack := p.vstack[len(p.vstack)-1]
	return p.cur.onKeyword2(stack["v"])
}

// onKeyword10 turns the Token bound to v into a keyword token: Kw is set and
// Val is carried over. v must hold a Token.
func (c *current) onKeyword10(v interface{}) (interface{}, error) {
	return Token{Kw: true, Val: v.(Token).Val}, nil
}

// callonKeyword10 passes the value labeled "v" in the top variable set of the
// vstack to onKeyword10.
func (p *parser) callonKeyword10() (interface{}, error) {
	stack := p.vstack[len(p.vstack)-1]
	return p.cur.onKeyword10(stack["v"])
}

// onKeyword18 turns the Token bound to v into a keyword token: Kw is set and
// Val is carried over. v must hold a Token.
func (c *current) onKeyword18(v interface{}) (interface{}, error) {
	return Token{Kw: true, Val: v.(Token).Val}, nil
}

// callonKeyword18 passes the value labeled "v" in the top variable set of the
// vstack to onKeyword18.
func (p *parser) callonKeyword18() (interface{}, error) {
	stack := p.vstack[len(p.vstack)-1]
	return p.cur.onKeyword18(stack["v"])
}

// onKeyword28 turns the Token bound to v into a keyword token: Kw is set and
// Val is carried over. v must hold a Token.
func (c *current) onKeyword28(v interface{}) (interface{}, error) {
	return Token{Kw: true, Val: v.(Token).Val}, nil
}

// callonKeyword28 passes the value labeled "v" in the top variable set of the
// vstack to onKeyword28.
func (p *parser) callonKeyword28() (interface{}, error) {
	stack := p.vstack[len(p.vstack)-1]
	return p.cur.onKeyword28(stack["v"])
}

// onPunct1 wraps the raw text of the current match in a non-keyword Token.
func (c *current) onPunct1() (interface{}, error) {
	return Token{Val: c.text}, nil
}

// callonPunct1 runs onPunct1. The action takes no labeled values, but the top
// of the vstack is still indexed, as in the other callon* helpers.
func (p *parser) callonPunct1() (interface{}, error) {
	_ = p.vstack[len(p.vstack)-1]
	return p.cur.onPunct1()
}

// onIdentifier3 wraps the raw text of the current match in a non-keyword
// Token.
func (c *current) onIdentifier3() (interface{}, error) {
	return Token{Val: c.text}, nil
}

// callonIdentifier3 runs onIdentifier3. The action takes no labeled values,
// but the top of the vstack is still indexed, as in the other callon* helpers.
func (p *parser) callonIdentifier3() (interface{}, error) {
	_ = p.vstack[len(p.vstack)-1]
	return p.cur.onIdentifier3()
}

var (
	// errNoRule is returned when the grammar to parse has no rule.
	errNoRule = errors.New("grammar has no rule")

	// errInvalidEncoding is returned when the source is not properly
	// utf8-encoded.
	errInvalidEncoding = errors.New("invalid encoding")

	// errMaxExprCnt is used to signal that the maximum number of
	// expressions have been parsed.
	errMaxExprCnt = errors.New("max number of expressions parsed")
)

// Option is a function that can set an option on the parser. It returns
// the previous setting as an Option.
type Option func(*parser) Option

// MaxExpressions creates an Option to stop parsing after the provided
// number of expressions have been parsed, if the value is 0 then the parser will
// parse for as many steps as needed (possibly an infinite number).
//
// The default for maxExprCnt is 0.
func MaxExpressions(maxExprCnt uint64) Option {
	return func(p *parser) Option {
		prev := p.maxExprCnt
		p.maxExprCnt = maxExprCnt
		return MaxExpressions(prev)
	}
}

// Debug creates an Option to set the debug flag to b. When set to true,
// debugging information is printed to stdout while parsing.
//
// The default is false.
func Debug(b bool) Option {
	return func(p *parser) Option {
		prev := p.debug
		p.debug = b
		return Debug(prev)
	}
}

// Memoize creates an Option to set the memoize flag to b.
When set to true, // the parser will cache all results so each expression is evaluated only // once. This guarantees linear parsing time even for pathological cases, // at the expense of more memory and slower times for typical cases. // // The default is false. func Memoize(b bool) Option { return func(p *parser) Option { old := p.memoize p.memoize = b return Memoize(old) } } // Recover creates an Option to set the recover flag to b. When set to // true, this causes the parser to recover from panics and convert it // to an error. Setting it to false can be useful while debugging to // access the full stack trace. // // The default is true. func Recover(b bool) Option { return func(p *parser) Option { old := p.recover p.recover = b return Recover(old) } } // GlobalStore creates an Option to set a key to a certain value in // the globalStore. func GlobalStore(key string, value interface{}) Option { return func(p *parser) Option { old := p.cur.globalStore[key] p.cur.globalStore[key] = value return GlobalStore(key, old) } } // ParseFile parses the file identified by filename. func ParseFile(filename string, opts ...Option) (i interface{}, err error) { f, err := os.Open(filename) if err != nil { return nil, err } defer func() { if closeErr := f.Close(); closeErr != nil { err = closeErr } }() return ParseReader(filename, f, opts...) } // ParseReader parses the data from r using filename as information in the // error messages. func ParseReader(filename string, r io.Reader, opts ...Option) (interface{}, error) { b, err := ioutil.ReadAll(r) if err != nil { return nil, err } return Parse(filename, b, opts...) } // Parse parses the data from b using filename as information in the // error messages. func Parse(filename string, b []byte, opts ...Option) (interface{}, error) { return newParser(filename, b, opts...).parse(g) } // position records a position in the text. 
type position struct { line, col, offset int } func (p position) String() string { return fmt.Sprintf("%d:%d [%d]", p.line, p.col, p.offset) } // savepoint stores all state required to go back to this point in the // parser. type savepoint struct { position rn rune w int } type current struct { pos position // start position of the match text []byte // raw text of the match // the globalStore allows the parser to store arbitrary values globalStore map[string]interface{} } // the AST types... type grammar struct { pos position rules []*rule } type rule struct { pos position name string displayName string expr interface{} } type choiceExpr struct { pos position alternatives []interface{} } type actionExpr struct { pos position expr interface{} run func(*parser) (interface{}, error) } type seqExpr struct { pos position exprs []interface{} } type labeledExpr struct { pos position label string expr interface{} } type expr struct { pos position expr interface{} } type andExpr expr type notExpr expr type zeroOrOneExpr expr type zeroOrMoreExpr expr type oneOrMoreExpr expr type ruleRefExpr struct { pos position name string } type andCodeExpr struct { pos position run func(*parser) (bool, error) } type notCodeExpr struct { pos position run func(*parser) (bool, error) } type litMatcher struct { pos position val string ignoreCase bool } type charClassMatcher struct { pos position val string basicLatinChars [128]bool chars []rune ranges []rune classes []*unicode.RangeTable ignoreCase bool inverted bool } type anyMatcher position // errList cumulates the errors found by the parser. 
type errList []error func (e *errList) add(err error) { *e = append(*e, err) } func (e errList) err() error { if len(e) == 0 { return nil } e.dedupe() return e } func (e *errList) dedupe() { var cleaned []error set := make(map[string]bool) for _, err := range *e { if msg := err.Error(); !set[msg] { set[msg] = true cleaned = append(cleaned, err) } } *e = cleaned } func (e errList) Error() string { switch len(e) { case 0: return "" case 1: return e[0].Error() default: var buf bytes.Buffer for i, err := range e { if i > 0 { buf.WriteRune('\n') } buf.WriteString(err.Error()) } return buf.String() } } // parserError wraps an error with a prefix indicating the rule in which // the error occurred. The original error is stored in the Inner field. type parserError struct { Inner error pos position prefix string expected []string } // Error returns the error message. func (p *parserError) Error() string { return p.prefix + ": " + p.Inner.Error() } // newParser creates a parser with the specified input source and options. func newParser(filename string, b []byte, opts ...Option) *parser { p := &parser{ filename: filename, errs: new(errList), data: b, pt: savepoint{position: position{line: 1}}, recover: true, cur: current{ globalStore: make(map[string]interface{}), }, maxFailPos: position{col: 1, line: 1}, maxFailExpected: make([]string, 0, 20), } p.setOptions(opts) if p.maxExprCnt == 0 { p.maxExprCnt = math.MaxUint64 } return p } // setOptions applies the options to the parser. 
func (p *parser) setOptions(opts []Option) { for _, opt := range opts { opt(p) } } type resultTuple struct { v interface{} b bool end savepoint } type parser struct { filename string pt savepoint cur current data []byte errs *errList depth int recover bool debug bool memoize bool // memoization table for the packrat algorithm: // map[offset in source] map[expression or rule] {value, match} memo map[int]map[interface{}]resultTuple // rules table, maps the rule identifier to the rule node rules map[string]*rule // variables stack, map of label to value vstack []map[string]interface{} // rule stack, allows identification of the current rule in errors rstack []*rule // parse fail maxFailPos position maxFailExpected []string maxFailInvertExpected bool // stats and used for stopping the parser // after a maximum number of expressions are parsed exprCnt uint64 // max number of expressions to be parsed maxExprCnt uint64 } // push a variable set on the vstack. func (p *parser) pushV() { if cap(p.vstack) == len(p.vstack) { // create new empty slot in the stack p.vstack = append(p.vstack, nil) } else { // slice to 1 more p.vstack = p.vstack[:len(p.vstack)+1] } // get the last args set m := p.vstack[len(p.vstack)-1] if m != nil && len(m) == 0 { // empty map, all good return } m = make(map[string]interface{}) p.vstack[len(p.vstack)-1] = m } // pop a variable set from the vstack. 
func (p *parser) popV() { // if the map is not empty, clear it m := p.vstack[len(p.vstack)-1] if len(m) > 0 { // GC that map p.vstack[len(p.vstack)-1] = nil } p.vstack = p.vstack[:len(p.vstack)-1] } func (p *parser) print(prefix, s string) string { if !p.debug { return s } fmt.Printf("%s %d:%d:%d: %s [%#U]\n", prefix, p.pt.line, p.pt.col, p.pt.offset, s, p.pt.rn) return s } func (p *parser) in(s string) string { p.depth++ return p.print(strings.Repeat(" ", p.depth)+">", s) } func (p *parser) out(s string) string { p.depth-- return p.print(strings.Repeat(" ", p.depth)+"<", s) } func (p *parser) addErr(err error) { p.addErrAt(err, p.pt.position, []string{}) } func (p *parser) addErrAt(err error, pos position, expected []string) { var buf bytes.Buffer if p.filename != "" { buf.WriteString(p.filename) } if buf.Len() > 0 { buf.WriteString(":") } buf.WriteString(fmt.Sprintf("%d:%d (%d)", pos.line, pos.col, pos.offset)) if len(p.rstack) > 0 { if buf.Len() > 0 { buf.WriteString(": ") } rule := p.rstack[len(p.rstack)-1] if rule.displayName != "" { buf.WriteString("rule " + rule.displayName) } else { buf.WriteString("rule " + rule.name) } } pe := &parserError{Inner: err, pos: pos, prefix: buf.String(), expected: expected} p.errs.add(pe) } func (p *parser) failAt(fail bool, pos position, want string) { // process fail if parsing fails and not inverted or parsing succeeds and invert is set if fail == p.maxFailInvertExpected { if pos.offset < p.maxFailPos.offset { return } if pos.offset > p.maxFailPos.offset { p.maxFailPos = pos p.maxFailExpected = p.maxFailExpected[:0] } if p.maxFailInvertExpected { want = "!" + want } p.maxFailExpected = append(p.maxFailExpected, want) } } // read advances the parser to the next rune. 
func (p *parser) read() { p.pt.offset += p.pt.w rn, n := utf8.DecodeRune(p.data[p.pt.offset:]) p.pt.rn = rn p.pt.w = n p.pt.col++ if rn == '\n' { p.pt.line++ p.pt.col = 0 } if rn == utf8.RuneError { if n == 1 { p.addErr(errInvalidEncoding) } } } // restore parser position to the savepoint pt. func (p *parser) restore(pt savepoint) { if p.debug { defer p.out(p.in("restore")) } if pt.offset == p.pt.offset { return } p.pt = pt } // get the slice of bytes from the savepoint start to the current position. func (p *parser) sliceFrom(start savepoint) []byte { return p.data[start.position.offset:p.pt.position.offset] } func (p *parser) getMemoized(node interface{}) (resultTuple, bool) { if len(p.memo) == 0 { return resultTuple{}, false } m := p.memo[p.pt.offset] if len(m) == 0 { return resultTuple{}, false } res, ok := m[node] return res, ok } func (p *parser) setMemoized(pt savepoint, node interface{}, tuple resultTuple) { if p.memo == nil { p.memo = make(map[int]map[interface{}]resultTuple) } m := p.memo[pt.offset] if m == nil { m = make(map[interface{}]resultTuple) p.memo[pt.offset] = m } m[node] = tuple } func (p *parser) buildRulesTable(g *grammar) { p.rules = make(map[string]*rule, len(g.rules)) for _, r := range g.rules { p.rules[r.name] = r } } func (p *parser) parse(g *grammar) (val interface{}, err error) { if len(g.rules) == 0 { p.addErr(errNoRule) return nil, p.errs.err() } // TODO : not super critical but this could be generated p.buildRulesTable(g) if p.recover { // panic can be used in action code to stop parsing immediately // and return the panic as an error. 
defer func() { if e := recover(); e != nil { if p.debug { defer p.out(p.in("panic handler")) } val = nil switch e := e.(type) { case error: p.addErr(e) default: p.addErr(fmt.Errorf("%v", e)) } err = p.errs.err() } }() } // start rule is rule [0] p.read() // advance to first rune val, ok := p.parseRule(g.rules[0]) if !ok { if len(*p.errs) == 0 { // If parsing fails, but no errors have been recorded, the expected values // for the farthest parser position are returned as error. maxFailExpectedMap := make(map[string]struct{}, len(p.maxFailExpected)) for _, v := range p.maxFailExpected { maxFailExpectedMap[v] = struct{}{} } expected := make([]string, 0, len(maxFailExpectedMap)) eof := false if _, ok := maxFailExpectedMap["!."]; ok { delete(maxFailExpectedMap, "!.") eof = true } for k := range maxFailExpectedMap { expected = append(expected, k) } sort.Strings(expected) if eof { expected = append(expected, "EOF") } p.addErrAt(errors.New("no match found, expected: "+listJoin(expected, ", ", "or")), p.maxFailPos, expected) } return nil, p.errs.err() } return val, p.errs.err() } func listJoin(list []string, sep string, lastSep string) string { switch len(list) { case 0: return "" case 1: return list[0] default: return fmt.Sprintf("%s %s %s", strings.Join(list[:len(list)-1], sep), lastSep, list[len(list)-1]) } } func (p *parser) parseRule(rule *rule) (interface{}, bool) { if p.debug { defer p.out(p.in("parseRule " + rule.name)) } if p.memoize { res, ok := p.getMemoized(rule) if ok { p.restore(res.end) return res.v, res.b } } start := p.pt p.rstack = append(p.rstack, rule) p.pushV() val, ok := p.parseExpr(rule.expr) p.popV() p.rstack = p.rstack[:len(p.rstack)-1] if ok && p.debug { p.print(strings.Repeat(" ", p.depth)+"MATCH", string(p.sliceFrom(start))) } if p.memoize { p.setMemoized(start, rule, resultTuple{val, ok, p.pt}) } return val, ok } func (p *parser) parseExpr(expr interface{}) (interface{}, bool) { var pt savepoint if p.memoize { res, ok := p.getMemoized(expr) if ok 
{ p.restore(res.end) return res.v, res.b } pt = p.pt } p.exprCnt++ if p.exprCnt > p.maxExprCnt { panic(errMaxExprCnt) } var val interface{} var ok bool switch expr := expr.(type) { case *actionExpr: val, ok = p.parseActionExpr(expr) case *andCodeExpr: val, ok = p.parseAndCodeExpr(expr) case *andExpr: val, ok = p.parseAndExpr(expr) case *anyMatcher: val, ok = p.parseAnyMatcher(expr) case *charClassMatcher: val, ok = p.parseCharClassMatcher(expr) case *choiceExpr: val, ok = p.parseChoiceExpr(expr) case *labeledExpr: val, ok = p.parseLabeledExpr(expr) case *litMatcher: val, ok = p.parseLitMatcher(expr) case *notCodeExpr: val, ok = p.parseNotCodeExpr(expr) case *notExpr: val, ok = p.parseNotExpr(expr) case *oneOrMoreExpr: val, ok = p.parseOneOrMoreExpr(expr) case *ruleRefExpr: val, ok = p.parseRuleRefExpr(expr) case *seqExpr: val, ok = p.parseSeqExpr(expr) case *zeroOrMoreExpr: val, ok = p.parseZeroOrMoreExpr(expr) case *zeroOrOneExpr: val, ok = p.parseZeroOrOneExpr(expr) default: panic(fmt.Sprintf("unknown expression type %T", expr)) } if p.memoize { p.setMemoized(pt, expr, resultTuple{val, ok, p.pt}) } return val, ok } func (p *parser) parseActionExpr(act *actionExpr) (interface{}, bool) { if p.debug { defer p.out(p.in("parseActionExpr")) } start := p.pt val, ok := p.parseExpr(act.expr) if ok { p.cur.pos = start.position p.cur.text = p.sliceFrom(start) actVal, err := act.run(p) if err != nil { p.addErrAt(err, start.position, []string{}) } val = actVal } if ok && p.debug { p.print(strings.Repeat(" ", p.depth)+"MATCH", string(p.sliceFrom(start))) } return val, ok } func (p *parser) parseAndCodeExpr(and *andCodeExpr) (interface{}, bool) { if p.debug { defer p.out(p.in("parseAndCodeExpr")) } ok, err := and.run(p) if err != nil { p.addErr(err) } return nil, ok } func (p *parser) parseAndExpr(and *andExpr) (interface{}, bool) { if p.debug { defer p.out(p.in("parseAndExpr")) } pt := p.pt p.pushV() _, ok := p.parseExpr(and.expr) p.popV() p.restore(pt) return nil, ok } func 
(p *parser) parseAnyMatcher(any *anyMatcher) (interface{}, bool) { if p.debug { defer p.out(p.in("parseAnyMatcher")) } if p.pt.rn != utf8.RuneError { start := p.pt p.read() p.failAt(true, start.position, ".") return p.sliceFrom(start), true } p.failAt(false, p.pt.position, ".") return nil, false } func (p *parser) parseCharClassMatcher(chr *charClassMatcher) (interface{}, bool) { if p.debug { defer p.out(p.in("parseCharClassMatcher")) } cur := p.pt.rn start := p.pt // can't match EOF if cur == utf8.RuneError { p.failAt(false, start.position, chr.val) return nil, false } if chr.ignoreCase { cur = unicode.ToLower(cur) } // try to match in the list of available chars for _, rn := range chr.chars { if rn == cur { if chr.inverted { p.failAt(false, start.position, chr.val) return nil, false } p.read() p.failAt(true, start.position, chr.val) return p.sliceFrom(start), true } } // try to match in the list of ranges for i := 0; i < len(chr.ranges); i += 2 { if cur >= chr.ranges[i] && cur <= chr.ranges[i+1] { if chr.inverted { p.failAt(false, start.position, chr.val) return nil, false } p.read() p.failAt(true, start.position, chr.val) return p.sliceFrom(start), true } } // try to match in the list of Unicode classes for _, cl := range chr.classes { if unicode.Is(cl, cur) { if chr.inverted { p.failAt(false, start.position, chr.val) return nil, false } p.read() p.failAt(true, start.position, chr.val) return p.sliceFrom(start), true } } if chr.inverted { p.read() p.failAt(true, start.position, chr.val) return p.sliceFrom(start), true } p.failAt(false, start.position, chr.val) return nil, false } func (p *parser) parseChoiceExpr(ch *choiceExpr) (interface{}, bool) { if p.debug { defer p.out(p.in("parseChoiceExpr")) } for _, alt := range ch.alternatives { p.pushV() val, ok := p.parseExpr(alt) p.popV() if ok { return val, ok } } return nil, false } func (p *parser) parseLabeledExpr(lab *labeledExpr) (interface{}, bool) { if p.debug { defer p.out(p.in("parseLabeledExpr")) } 
p.pushV() val, ok := p.parseExpr(lab.expr) p.popV() if ok && lab.label != "" { m := p.vstack[len(p.vstack)-1] m[lab.label] = val } return val, ok } func (p *parser) parseLitMatcher(lit *litMatcher) (interface{}, bool) { if p.debug { defer p.out(p.in("parseLitMatcher")) } ignoreCase := "" if lit.ignoreCase { ignoreCase = "i" } val := fmt.Sprintf("%q%s", lit.val, ignoreCase) start := p.pt for _, want := range lit.val { cur := p.pt.rn if lit.ignoreCase { cur = unicode.ToLower(cur) } if cur != want { p.failAt(false, start.position, val) p.restore(start) return nil, false } p.read() } p.failAt(true, start.position, val) return p.sliceFrom(start), true } func (p *parser) parseNotCodeExpr(not *notCodeExpr) (interface{}, bool) { if p.debug { defer p.out(p.in("parseNotCodeExpr")) } ok, err := not.run(p) if err != nil { p.addErr(err) } return nil, !ok } func (p *parser) parseNotExpr(not *notExpr) (interface{}, bool) { if p.debug { defer p.out(p.in("parseNotExpr")) } pt := p.pt p.pushV() p.maxFailInvertExpected = !p.maxFailInvertExpected _, ok := p.parseExpr(not.expr) p.maxFailInvertExpected = !p.maxFailInvertExpected p.popV() p.restore(pt) return nil, !ok } func (p *parser) parseOneOrMoreExpr(expr *oneOrMoreExpr) (interface{}, bool) { if p.debug { defer p.out(p.in("parseOneOrMoreExpr")) } var vals []interface{} for { p.pushV() val, ok := p.parseExpr(expr.expr) p.popV() if !ok { if len(vals) == 0 { // did not match once, no match return nil, false } return vals, true } vals = append(vals, val) } } func (p *parser) parseRuleRefExpr(ref *ruleRefExpr) (interface{}, bool) { if p.debug { defer p.out(p.in("parseRuleRefExpr " + ref.name)) } if ref.name == "" { panic(fmt.Sprintf("%s: invalid rule: missing name", ref.pos)) } rule := p.rules[ref.name] if rule == nil { p.addErr(fmt.Errorf("undefined rule: %s", ref.name)) return nil, false } return p.parseRule(rule) } func (p *parser) parseSeqExpr(seq *seqExpr) (interface{}, bool) { if p.debug { defer p.out(p.in("parseSeqExpr")) } vals 
:= make([]interface{}, 0, len(seq.exprs)) pt := p.pt for _, expr := range seq.exprs { val, ok := p.parseExpr(expr) if !ok { p.restore(pt) return nil, false } vals = append(vals, val) } return vals, true } func (p *parser) parseZeroOrMoreExpr(expr *zeroOrMoreExpr) (interface{}, bool) { if p.debug { defer p.out(p.in("parseZeroOrMoreExpr")) } var vals []interface{} for { p.pushV() val, ok := p.parseExpr(expr.expr) p.popV() if !ok { return vals, true } vals = append(vals, val) } } func (p *parser) parseZeroOrOneExpr(expr *zeroOrOneExpr) (interface{}, bool) { if p.debug { defer p.out(p.in("parseZeroOrOneExpr")) } p.pushV() val, _ := p.parseExpr(expr.expr) p.popV() // whether it matched or not, consider it a match return val, true } ================================================ FILE: parser/parser_test.go ================================================ package parser import ( "reflect" "testing" ) func TestParseSample(t *testing.T) { type args struct { sampleID int sample []byte } tests := []struct { name string args args want []Token wantErr bool }{ 0: { "err: empty sample", args{0, nil}, nil, true, }, 1: { "normal sample", args{1, []byte("play {Name} from {Artist}")}, []Token{ {Val: []byte("play")}, {Kw: true, Val: []byte("Name")}, {Val: []byte("from")}, {Kw: true, Val: []byte("Artist")}, }, false, }, 2: { "spacing inside keys", args{1, []byte("play { Name} from { Artist }")}, []Token{ {Val: []byte("play")}, {Kw: true, Val: []byte("Name")}, {Val: []byte("from")}, {Kw: true, Val: []byte("Artist")}, }, false, }, 3: { "multi word", args{1, []byte("I need {Name} since {Since}")}, []Token{ {Val: []byte("I")}, {Val: []byte("need")}, {Kw: true, Val: []byte("Name")}, {Val: []byte("since")}, {Kw: true, Val: []byte("Since")}, }, false, }, } for i, tt := range tests { t.Run(tt.name, func(t *testing.T) { got, err := ParseSample(tt.args.sampleID, tt.args.sample) if (err != nil) != tt.wantErr { t.Errorf("Test#%d: ParseSample() error = %v, wantErr %v", i, err, tt.wantErr) return 
} if !reflect.DeepEqual(got, tt.want) { t.Errorf("Test#%d: ParseSample() = %v, want %v", i, got, tt.want) } }) } }