Repository: chewxy/lingo Branch: master Commit: 491e816b48d4 Files: 128 Total size: 278.9 KB Directory structure: gitextract_whqjv2y6/ ├── .gitignore ├── .travis.yml ├── CONTRIBUTING.md ├── CONTRIBUTORS.md ├── LICENSE ├── POSTag.go ├── POSTag_stanford.go ├── POSTag_stanford_string.go ├── POSTag_universal.go ├── POSTag_universal_string.go ├── README.md ├── annotation.go ├── annotationSet.go ├── annotationSet_bench_test.go ├── browncluster.go ├── cmd/ │ ├── demo/ │ │ ├── io.go │ │ ├── main.go │ │ └── nlp.go │ ├── dep/ │ │ ├── fixer.go │ │ ├── io.go │ │ ├── main.go │ │ ├── pipeline.go │ │ └── train.go │ ├── lexer/ │ │ └── main.go │ └── pos/ │ ├── crossvalidation.go │ ├── fixer.go │ └── main.go ├── const.go ├── corpus/ │ ├── consopt.go │ ├── corpus.go │ ├── corpus_test.go │ ├── functions.go │ ├── functions_test.go │ ├── inflection.go │ ├── inflection_test.go │ ├── io.go │ ├── io_test.go │ ├── lda.go │ ├── test_test.go │ └── utils.go ├── dep/ │ ├── README.md │ ├── arcStandard.go │ ├── arcStandard_test.go │ ├── configuration.go │ ├── configuration_test.go │ ├── debug.go │ ├── dependencyParser.go │ ├── documentation/ │ │ ├── iamhuman.dot │ │ └── thecatsatonthemat.dot │ ├── errors.go │ ├── evaluation.go │ ├── example.go │ ├── example_test.go │ ├── featureExtraction.go │ ├── features.go │ ├── features_string.go │ ├── fix.go │ ├── init.go │ ├── models.go │ ├── models_test.go │ ├── move.go │ ├── move_string.go │ ├── nn2.go │ ├── nn2_io.go │ ├── nn2_io_test.go │ ├── nn2_test.go │ ├── nnconfig.go │ ├── release.go │ ├── span.go │ ├── test_test.go │ ├── train.go │ ├── train_test.go │ ├── transition.go │ └── util.go ├── dependency.go ├── dependencyTree.go ├── dependencyType.go ├── dependencyType_stanford.go ├── dependencyType_stanford_string.go ├── dependencyType_universal.go ├── dependencyType_universal_string.go ├── errors.go ├── go.mod ├── go.sum ├── interfaces.go ├── io.go ├── io_test.go ├── lexeme.go ├── lexemetype_string.go ├── lexer/ │ ├── lexer.go │ ├── lexer_test.go │ └── 
stateFn.go ├── lingo.go ├── pos/ │ ├── allinone_test.go │ ├── context.go │ ├── context_test.go │ ├── contexttype_string.go │ ├── debug.go │ ├── errors.go │ ├── features.go │ ├── features_test.go │ ├── featuretype_string.go │ ├── models.go │ ├── models_test.go │ ├── perceptron.go │ ├── perceptron_io.go │ ├── perceptron_io_test.go │ ├── postagger.go │ ├── release.go │ ├── sentence.go │ ├── test_test.go │ ├── util.go │ └── util_test.go ├── sentence.go ├── sets.go ├── shape.go ├── stopwords.go ├── treebank/ │ ├── const_postag_stanford.go │ ├── const_postag_universal.go │ ├── const_rel_stanford.go │ ├── const_rel_universal.go │ ├── sentenceTag.go │ ├── sentenceTag_test.go │ ├── treebank.go │ ├── treebank_test.go │ └── util.go ├── utils.go └── wordFlags.go ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Compiled Object files, Static and Dynamic libs (Shared Objects) *.o *.a *.so # Folders _obj _test # Architecture specific extensions/prefixes *.[568vq] [568vq].out *.cgo1.go *.cgo2.c _cgo_defun.c _cgo_gotypes.go _cgo_export.* _testmain.go *.exe *.test *.prof ================================================ FILE: .travis.yml ================================================ language: go branches: only: - master go: - 1.11.x - 1.12.x - 1.13.x - tip env: - GO111MODULE=on matrix: allow_failures: - go: tip ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing # Contributors are welcome! We want to make contributing as easy as possible, and the process is very Github-centric. [Github Issues](https://github.com/chewxy/lingo/issues) are used to manage any contributions and changes. If you don't have a github account, please feel free to email me (my user name [at] gmail.com), and I'll gladly open an issue on your behalf. 
# Process # Say you have a change you want to make. This is the process: 1. Open an issue. 2. I'll have a brief discussion with you. If you don't feel comfortable with a public discussion, I'm happy to discuss it over email. 3. Fork this project on Github, and clone it to your local machine. 4. Make your changes. 5. Make sure you have tests. If you foresee breaking any API, it is vital that it be discussed beforehand. 6. Make sure your tests pass. 7. `gofmt` your code. 8. Send a Pull Request. Say you instead saw one of the [many issues](https://github.com/chewxy/lingo/issues) and want to solve one of them. This is the process: 1. Comment on the issue saying you'll pick it up. (Alternatively, email me.) 2. Fork this project on Github, and clone it to your local machine. 3. Make your changes. 4. Make sure you have tests. If you foresee breaking any API, it is vital that it be discussed beforehand. 5. Make sure your tests pass. 6. `gofmt` your code. 7. Send a Pull Request. ## Pull Requests ## I'll review every pull request. I may request some changes, or delve into further discussions. After that, once I'm satisfied everything passes, I'll merge the pull request. Then I'll add your name into the CONTRIBUTORS list. # Debugging # This package comes with a debug tag option. Most subpackages have a `debug.go` which contains a `logf` function for logging anything you wish to trace. 
================================================ FILE: CONTRIBUTORS.md ================================================ # Contributors # * Xuanyi Chew (@chewxy) - initial package ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2017 Chewxy Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: POSTag.go ================================================ package lingo import ( "fmt" "strings" ) // POSTag represents a Part of Speech Tag. 
type POSTag byte var posTagLookup map[string]POSTag func init() { posTagLookup = make(map[string]POSTag) for t := X; t < MAXTAG; t++ { s := t.String() posTagLookup[s] = POSTag(t) posTagLookup[strings.ToLower(s)] = POSTag(t) } } func (p POSTag) MarshalText() ([]byte, error) { return []byte(fmt.Sprintf("%v", p)), nil // add quotes back } func (p *POSTag) UnmarshalText(text []byte) error { str := strings.Trim(string(text), `"`) // for JSON use, if any tag, _ := posTagLookup[str] *p = tag return nil } // POSTag related functions func InPOSTags(x POSTag, set []POSTag) bool { for _, v := range set { if v == x { return true } } return false } func IsAdjective(x POSTag) bool { return InPOSTags(x, Adjectives) } func IsNoun(x POSTag) bool { return InPOSTags(x, Nouns) } func IsProperNoun(x POSTag) bool { return InPOSTags(x, ProperNouns) } func IsVerb(x POSTag) bool { return InPOSTags(x, Verbs) } func IsAdverb(x POSTag) bool { return InPOSTags(x, Adverbs) } func IsInterrogative(x POSTag) bool { return InPOSTags(x, Interrogatives) } func IsDeterminer(x POSTag) bool { return InPOSTags(x, Determiners) } func IsNumber(x POSTag) bool { return InPOSTags(x, Numbers) } func IsSymbol(x POSTag) bool { return InPOSTags(x, Symbols) } ================================================ FILE: POSTag_stanford.go ================================================ // +build stanfordtags package lingo //go:generate stringer -type=POSTag -output=POSTag_stanford_string.go const BUILD_TAGSET = "stanfordtags" const ( X POSTag = iota // aka NULLTAG UNKNOWN_TAG // Unknown ROOT_TAG // For Root CC // Coordinating conjunction CD // Cardinal number DT // Determiner EX // Existential there FW // Foreign word IN // Preposition or subordinating conjunction JJ // Adjective JJR // Adjective, comparative JJS // Adjective, superlative LS // List item marker MD // Modal NN // Noun, singular or mass NNS // Noun, plural NNP // Proper noun, singular NNPS // Proper noun, plural PDT // Predeterminer POS // Possessive 
ending PRP // Personal pronoun PPRP // Possessive pronoun (PRP$) RB // Adverb RBR // Adverb, comparative RBS // Adverb, superlative RP // Particle SYM // Symbol TO // to UH // Interjection VB // Verb, base form VBD // Verb, past tense VBG // Verb, gerund or present participle VBN // Verb, past participle VBP // Verb, non-3rd person singular present VBZ // Verb, 3rd person singular present WDT // Wh-determiner WP // Wh-pronoun PWP // Possessive wh-pronoun (WP$) WRB // Wh-adverb // Punctuation related stuff: http://stackoverflow.com/a/21546294 COMMA // Obvious isn't it? FULLSTOP // fullstop OPENQUOTE // Penn Treebank uses `` CLOSEQUOTE // Penn Treebank uses '' COLON DOLLAR HASHSIGN LEFTBRACE RIGHTBRACE // Extensions for web shit: https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/etb-supplementary-guidelines-2009-addendum.pdf // http://clear.colorado.edu/compsem/documents/treebank_guidelines.pdf HYPH // Hyphen in split compounds AFX // affix ADD // url or email addy NFP // superfluous (non final) puncutation GW // Goes WIth XX // deidentified data (aka giberish) MAXTAG ) // POSTagShortcut is a shortcut function to help the POSTagger shortcircuit some decisions about what the tag is func POSTagShortcut(l Lexeme) (POSTag, bool) { switch l.LexemeType { case Number: return CD, true case Punctuation: switch l.Value { case ",": return COMMA, true case ".": return FULLSTOP, true case "``": return OPENQUOTE, true case "''": return CLOSEQUOTE, true case ":": return COLON, true case "#": return HASHSIGN, true case "(": return LEFTBRACE, true case ")": return RIGHTBRACE, true default: return X, false } case Symbol: return SYM, true case URI: return ADD, true case Date: return CD, true case Time: return CD, true case EOF: return X, true } return X, false } // sets var Adjectives = []POSTag{JJ, JJR, JJS} var Nouns = []POSTag{NN, NNP, NNS, NNPS} var ProperNouns = []POSTag{NNP, NNPS} var Verbs = []POSTag{VB, VBD, VBG, VBN, VBP, VBZ} var Adverbs = []POSTag{RB, RBR, RBS} var 
Determiners = []POSTag{DT, PDT} var Interrogatives = []POSTag{WDT, WP, PWP, WRB} var Numbers = []POSTag{CD} var Symbols = []POSTag{SYM, FULLSTOP, COMMA, OPENQUOTE, COLON, DOLLAR, HASHSIGN, LEFTBRACE, RIGHTBRACE, HYPH, NFP} // IsIN returns true if the POSTag is a subordinating conjunction. // The reason why this exists is because in the stanford tag, IN is the POSTag // while in the universal dependencies, it's the SCONJ POSTag func IsIN(x POSTag) bool { return x == IN } ================================================ FILE: POSTag_stanford_string.go ================================================ // +build stanfordtags // Code generated by "stringer -type=POSTag -output=POSTag_stanford_string.go"; DO NOT EDIT package lingo import "fmt" const _POSTag_name = "XUNKNOWN_TAGROOT_TAGCCCDDTEXFWINJJJJRJJSLSMDNNNNSNNPNNPSPDTPOSPRPPPRPRBRBRRBSRPSYMTOUHVBVBDVBGVBNVBPVBZWDTWPPWPWRBCOMMAFULLSTOPOPENQUOTECLOSEQUOTECOLONDOLLARHASHSIGNLEFTBRACERIGHTBRACEHYPHAFXADDNFPGWXXMAXTAG" var _POSTag_index = [...]uint8{0, 1, 12, 20, 22, 24, 26, 28, 30, 32, 34, 37, 40, 42, 44, 46, 49, 52, 56, 59, 62, 65, 69, 71, 74, 77, 79, 82, 84, 86, 88, 91, 94, 97, 100, 103, 106, 108, 111, 114, 119, 127, 136, 146, 151, 157, 165, 174, 184, 188, 191, 194, 197, 199, 201, 207} func (i POSTag) String() string { if i >= POSTag(len(_POSTag_index)-1) { return fmt.Sprintf("POSTag(%d)", i) } return _POSTag_name[_POSTag_index[i]:_POSTag_index[i+1]] } ================================================ FILE: POSTag_universal.go ================================================ // +build !stanfordtags package lingo //go:generate stringer -type=POSTag -output=POSTag_universal_string.go const BUILD_TAGSET = "universaltags" const ( X POSTag = iota // aka NULLTAG UNKNOWN_TAG ROOT_TAG ADJ ADP ADV AUX CONJ DET INTJ NOUN NUM PART PRON PROPN PUNCT SCONJ SYM VERB MAXTAG // MAXTAG is provided here as index support ) // POSTagShortcut is a shortcut function to help the POSTagger shortcircuit some decisions about what the tag is func 
POSTagShortcut(l Lexeme) (POSTag, bool) { switch l.LexemeType { case Number: return NUM, true case Punctuation: return PUNCT, true case Symbol: return SYM, true case URI: return X, true case Date: return NUM, true case Time: return NUM, true case EOF: return X, true } return X, false } var Adjectives = []POSTag{ADJ} var Nouns = []POSTag{NOUN, PROPN} var ProperNouns = []POSTag{PROPN} var Verbs = []POSTag{VERB} var Adverbs = []POSTag{ADV} var Determiners = []POSTag{DET} var Interrogatives = []POSTag{PRON, DET, ADV} var Numbers = []POSTag{NUM} var Symbols = []POSTag{SYM, PUNCT} // IsIN returns true if the POSTag is a subordinating conjunction. // The reason why this exists is because in the stanford tag, IN is the POSTag // while in the universal dependencies, it's the SCONJ POSTag func IsIN(x POSTag) bool { return x == SCONJ } ================================================ FILE: POSTag_universal_string.go ================================================ // +build !stanfordtags // Code generated by "stringer -type=POSTag -output=POSTag_universal_string.go"; DO NOT EDIT package lingo import "fmt" const _POSTag_name = "XUNKNOWN_TAGROOT_TAGADJADPADVAUXCONJDETINTJNOUNNUMPARTPRONPROPNPUNCTSCONJSYMVERBMAXTAG" var _POSTag_index = [...]uint8{0, 1, 12, 20, 23, 26, 29, 32, 36, 39, 43, 47, 50, 54, 58, 63, 68, 73, 76, 80, 86} func (i POSTag) String() string { if i >= POSTag(len(_POSTag_index)-1) { return fmt.Sprintf("POSTag(%d)", i) } return _POSTag_name[_POSTag_index[i]:_POSTag_index[i+1]] } ================================================ FILE: README.md ================================================ # lingo # [![Build Status](https://travis-ci.org/chewxy/lingo.svg?branch=master)](https://travis-ci.org/chewxy/lingo) package `lingo` provides the data structures and algorithms required for natural language processing. Specifically, it provides a POS Tagger (`lingo/pos`), a Dependency Parser (`lingo/dep`), and a basic tokenizer (`lingo/lexer`) for English. 
It also provides data structures for holding corpuses (`lingo/corpus`), and treebanks (`lingo/treebank`). The aim of this package is to provide a production quality pipeline for natural language processing. # Install # The package is go-gettable: `go get -u github.com/chewxy/lingo` This package and its subpackages depend on very few external packages. Here they are: | Package | Used For | Vitality | Notes | Licence | |---------|----------|----------|-------|---------| | [gorgonia](https://github.com/chewxy/gorgonia) | Machine learning | Vital. It won't be hard to rewrite them, but why? | Same author | [Gorgonia Licence](https://github.com/chewxy/gorgonia/blob/master/LICENSE) (Apache 2.0-like) | | [gographviz](https://github.com/awalterschulze/gographviz) | Visualization of annotations, and other graph-related visualizations | Vital for visualizations, which are a nice-to-have feature | API last changed 12th April 2017 | [gographviz licence](https://github.com/awalterschulze/gographviz/blob/master/LICENSE) (Apache 2.0) | | [errors](https://github.com/pkg/errors) | Errors | The package won't die without it, but it's very nice to have | Stable API for the past year | [errors licence](https://github.com/pkg/errors/blob/master/LICENSE) (MIT/BSD like) | | [set](https://github.com/xtgo/set) | Set operations | Can be easily replaced | Stable API for the past year | [set licence](https://github.com/xtgo/set/blob/master/LICENSE) (MIT/BSD-like) | # Usage # See the individual packages for usage. There is also a bunch of executables in the `cmd` directory. They're meant to be examples of how a natural language processing pipeline can be set up. A natural language pipeline with this package is heavily channel-driven. Here is an example for dependency parsing: ```go func main() { inputString := `The cat sat on the mat` lx := lexer.New("dummy", strings.NewReader(inputString)) // lexer - required to break a sentence up into words. 
pt := pos.New(pos.WithModel(posModel)) // POS Tagger - required to tag the words with a part of speech tag. dp := dep.New(depModel) // Creates a new parser // set up a pipeline pt.Input = lx.Output dp.Input = pt.Output // run all go lx.Run() go pt.Run() go dp.Run() // wait to receive: for { select { case d := <-dp.Output: // do something case err := <-dp.Error: // handle error } } } ``` # How It Works # For specific tasks (POS tagging, parsing, named entity recognition etc), refer to the README of each subpackage. This package on its own mainly provides the data structures that the subpackages will use. Perhaps the most important data structure is the `*Annotation` structure. It basically holds a word and the associated metadata for the word. For dependency parses, the graph takes three forms: `*Dependency`, `*DependencyTree` and `*Annotation`. All three forms are convertible from one to another. TODO: explain rationale behind each data type. ## Quirks ## ### Very Oddly Specific POS Tags and Dependency Rel Types ### A particular quirk you may have noticed is that the `POSTag` and `DependencyType` are hard coded in as constants. This package does in fact provide two variations of each: one from Stanford/Penn Treebank and one from [UniversalDependencies](http://universaldependencies.org/). These are hardcoded mainly for performance reasons - knowing ahead of time how much to allocate reduces a lot of additional work the program has to do. It also reduces the chances of mutating a global variable. Of course this comes as a tradeoff - programs are limited to these two options. Thankfully there are only a limited number of POS Tag and Dependency Relation types. Two of the most popular ones (Stanford/PTB and Universal Dependencies) have been implemented. The following build tags are supported: * stanfordtags * universaltags * stanfordrel * universalrel To use a specific tagset or relset, build your program thusly: `go build -tags='stanfordtags'`. 
// Annotation is a word and its metadata.
// This includes the position, its dependency head (if available), its lemma, POSTag, etc.
//
// A collection of Annotations (an AnnotatedSentence) is also a representation of a dependency parse.
//
// Every field except children is exported for easy gob encoding. Be very careful when setting fields directly.
type Annotation struct {
	Lexeme
	POSTag

	// NER

	// fields to do with an annotation being in a collection
	DependencyType
	ID       int
	Head     *Annotation   // dependency head; nil when no head has been set
	children AnnotationSet // will not be serialized

	// info about the annotation itself
	Lemma   string
	Lowered string // lower-cased Value; filled in by Process
	Stem    string // stem of Lowered as returned by the AnnotationFixer; filled in by Process

	// auxiliary data for processing
	Cluster
	Shape
	WordFlag
}
Fixer is optional func AnnotationFromLexTag(l Lexeme, t POSTag, f AnnotationFixer) *Annotation { a := &Annotation{ Lexeme: l, POSTag: t, DependencyType: NoDepType, Lemma: "", Lowered: strings.ToLower(l.Value), } // it's ok to panic - it will cause the tests to fail if err := a.Process(f); err != nil { panic(err) } return a } func (a *Annotation) Clone() *Annotation { b := *a b.ID = -1 b.Head = nil b.children = nil b.DependencyType = NoDepType return &b } func (a *Annotation) SetHead(headAnn *Annotation) { a.Head = headAnn if headAnn != rootAnnotation && headAnn != startAnnotation && headAnn != nullAnnotation { headAnn.children = append(headAnn.children, a) } } func (a *Annotation) HeadID() int { if a.Head != nil { return a.Head.ID } return -1 } func (a *Annotation) IsNumber() bool { return IsNumber(a.POSTag) && (a.LexemeType != Date && a.LexemeType != Time && a.LexemeType != URI) } func (a *Annotation) String() string { return a.Value } func (a *Annotation) GoString() string { s := fmt.Sprintf("%q/%s", a.Lexeme.Value, a.POSTag) if a.Head != nil { return fmt.Sprintf("(%v) <-%v- (%q/%s) ", s, a.DependencyType, a.Head.Value, a.Head.POSTag) } return s } func (a *Annotation) Process(f AnnotationFixer) error { if a.Lexeme != nullLexeme { a.Lowered = strings.ToLower(a.Value) a.Shape = a.Lexeme.Shape() a.WordFlag = a.Lexeme.Flags() var err error if f != nil { var stem string if stem, err = f.Stem(a.Lowered); err != nil { if _, ok := err.(componentUnavailable); !ok { return err } } a.Stem = stem var clust map[string]Cluster if clust, err = f.Clusters(); err == nil { a.Cluster = clust[a.Value] } } return nil } return errors.New("No Lexeme!") } var rootAnnotation = &Annotation{ Lexeme: rootLexeme, POSTag: ROOT_TAG, DependencyType: Root, ID: 0, Head: nil, Lemma: "", Lowered: "", Cluster: 0, Shape: "", WordFlag: NoFlag, } var startAnnotation = &Annotation{ Lexeme: startLexeme, POSTag: ROOT_TAG, DependencyType: NoDepType, ID: -1, Head: nil, Lemma: "", Lowered: "", Cluster: 0, 
Shape: "", WordFlag: NoFlag, } var nullAnnotation = &Annotation{ Lexeme: nullLexeme, POSTag: X, DependencyType: NoDepType, ID: -1, Head: nil, Lemma: "", Lowered: "", Cluster: 0, Shape: "", WordFlag: NoFlag, } func RootAnnotation() *Annotation { return rootAnnotation } func StartAnnotation() *Annotation { return startAnnotation } func NullAnnotation() *Annotation { return nullAnnotation } func StringToAnnotation(s string, f AnnotationFixer) *Annotation { l := MakeLexeme(s, Word) a := NewAnnotation() a.Lexeme = l if err := a.Process(f); err != nil { panic(err.Error()) } return a } type AnnotationFixer interface { Lemmatizer Stemmer Clusters() (map[string]Cluster, error) } ================================================ FILE: annotationSet.go ================================================ package lingo import ( "sort" "unsafe" "github.com/xtgo/set" ) type AnnotationSet []*Annotation func (as AnnotationSet) Len() int { return len(as) } func (as AnnotationSet) Swap(i, j int) { as[i], as[j] = as[j], as[i] } func (as AnnotationSet) Less(i, j int) bool { return uintptr(unsafe.Pointer(as[i])) < uintptr(unsafe.Pointer(as[j])) } func (as AnnotationSet) Set() AnnotationSet { sort.Sort(as) n := set.Uniq(as) return as[:n] } func (as AnnotationSet) Contains(a *Annotation) bool { if as.Index(a) == len(as) { return false } return true } func (as AnnotationSet) Index(a *Annotation) int { for i, an := range as { if an == a { return i } } return len(as) } func (as AnnotationSet) Add(a *Annotation) AnnotationSet { if as.Contains(a) { return as } as = append(as, a) return as } ================================================ FILE: annotationSet_bench_test.go ================================================ package lingo import ( "sort" "testing" ) func (as AnnotationSet) index2(a *Annotation) int { sort.Sort(as) f := func(i int) bool { return as[i] == a } return sort.Search(len(as), f) } var benchIndexRes int func benchASIndex(size int, b *testing.B) { var as AnnotationSet for i := 
0; i < size; i++ { as = append(as, new(Annotation)) } doesntcontain := new(Annotation) contains := as[0] for n := 0; n < b.N; n++ { benchIndexRes = as.Index(doesntcontain) benchIndexRes = as.Index(contains) } } func benchASIndex2(size int, b *testing.B) { var as AnnotationSet for i := 0; i < size; i++ { as = append(as, new(Annotation)) } doesntcontain := new(Annotation) contains := as[0] for n := 0; n < b.N; n++ { benchIndexRes = as.index2(doesntcontain) benchIndexRes = as.index2(contains) } } func BenchmarkAnnotationSetIndex_1(b *testing.B) { benchASIndex(1, b) } func BenchmarkAnnotationSetIndex_2(b *testing.B) { benchASIndex(2, b) } func BenchmarkAnnotationSetIndex_8(b *testing.B) { benchASIndex(8, b) } func BenchmarkAnnotationSetIndex_16(b *testing.B) { benchASIndex(16, b) } func BenchmarkAnnotationSetIndex_32(b *testing.B) { benchASIndex(32, b) } func BenchmarkAnnotationSetIndex_64(b *testing.B) { benchASIndex(64, b) } func BenchmarkAnnotationSetIndex_128(b *testing.B) { benchASIndex(128, b) } func BenchmarkAnnotationSetIndex_256(b *testing.B) { benchASIndex(256, b) } func BenchmarkAnnotationSetIndex_512(b *testing.B) { benchASIndex(512, b) } func BenchmarkAnnotationSetIndex_1024(b *testing.B) { benchASIndex(1024, b) } func BenchmarkAnnotationSetIndex2_1(b *testing.B) { benchASIndex2(1, b) } func BenchmarkAnnotationSetIndex2_2(b *testing.B) { benchASIndex2(2, b) } func BenchmarkAnnotationSetIndex2_8(b *testing.B) { benchASIndex2(8, b) } func BenchmarkAnnotationSetIndex2_16(b *testing.B) { benchASIndex2(16, b) } func BenchmarkAnnotationSetIndex2_32(b *testing.B) { benchASIndex2(32, b) } func BenchmarkAnnotationSetIndex2_64(b *testing.B) { benchASIndex2(64, b) } func BenchmarkAnnotationSetIndex2_128(b *testing.B) { benchASIndex2(128, b) } func BenchmarkAnnotationSetIndex2_256(b *testing.B) { benchASIndex2(256, b) } func BenchmarkAnnotationSetIndex2_512(b *testing.B) { benchASIndex2(512, b) } func BenchmarkAnnotationSetIndex2_1024(b *testing.B) { 
// Cluster represents a brown cluster
type Cluster int

// ReadCluster reads PercyLiang's cluster file format and returns a map of
// strings to Cluster.
//
// Each non-blank line must hold at least three tab-separated fields: the
// cluster as a string of binary digits, the word, and the number of times the
// word was seen. Words seen fewer than 3 times are mapped to Cluster(0)
// because their cluster is not reliable. Blank lines are skipped.
//
// After reading, the map is expanded with the lower-cased, capitalised
// (first letter upper-cased), strings.ToTitle and upper-cased forms of every
// word, unless that form already has an entry. Words are expanded in file
// order, so when two words share a recased form the earlier one wins and the
// result is deterministic.
//
// ReadCluster panics on a line with fewer than three fields, on a cluster or
// count that does not parse, and on a read error from r.
func ReadCluster(r io.Reader) map[string]Cluster {
	clusters := make(map[string]Cluster)
	var order []string // distinct words in the order they first appear

	scanner := bufio.NewScanner(r)
	for lineNo := 1; scanner.Scan(); lineNo++ {
		line := scanner.Text()
		if strings.TrimSpace(line) == "" {
			continue
		}

		fields := strings.Split(line, "\t")
		if len(fields) < 3 {
			panic("lingo: malformed cluster file, line " + strconv.Itoa(lineNo) +
				": want 3 tab-separated fields, got " + strconv.Quote(line))
		}
		word := fields[1]

		bits, err := strconv.ParseInt(fields[0], 2, 64)
		if err != nil {
			panic(err)
		}
		freq, err := strconv.Atoi(fields[2])
		if err != nil {
			panic(err)
		}

		// if clusterer has only seen a word a few times, then the cluster is not reliable
		cluster := Cluster(0)
		if freq >= 3 {
			cluster = Cluster(int(bits))
		}

		if _, seen := clusters[word]; !seen {
			order = append(order, word)
		}
		clusters[word] = cluster
	}
	if err := scanner.Err(); err != nil {
		panic(err)
	}

	// capitalize upper-cases the first rune of s and leaves the rest as is.
	capitalize := func(s string) string {
		for i := range s {
			if i > 0 {
				// i is the byte offset of the second rune
				return strings.ToUpper(s[:i]) + s[i:]
			}
		}
		return strings.ToUpper(s)
	}

	// expand clusters with recasing. Iterating over the recorded order rather
	// than over the map avoids inserting into a map that is being ranged over.
	for _, word := range order {
		clust := clusters[word]
		lowered := strings.ToLower(word)
		variants := [...]string{
			lowered,
			capitalize(lowered),
			strings.ToTitle(word), // kept so existing keys do not disappear
			strings.ToUpper(word),
		}
		for _, variant := range variants {
			if _, ok := clusters[variant]; !ok {
				clusters[variant] = clust
			}
		}
	}
	return clusters
}
log.Println("loading POS Tagger model") if posModel, err = pos.Load(posModelFile); err != nil { log.Fatal(err) } log.Println("loading Dependency Parser model") if depModel, err = dep.Load(depModelFile); err != nil { log.Fatal(err) } var f *os.File if f, err = os.Open(brownCluster); err != nil { log.Fatal(err) } clusters = lingo.ReadCluster(f) } ================================================ FILE: cmd/demo/main.go ================================================ package main import ( "io/ioutil" "os" "os/exec" "github.com/abiosoft/ishell" "github.com/chewxy/lingo" "github.com/pkg/browser" ) func main() { io() shell := ishell.New() var d *lingo.Dependency // var sent lingo.AnnotatedSentence var err error shell.AddCmd(&ishell.Cmd{ Name: "dep", Help: "perform dependency parsing", Func: func(c *ishell.Context) { c.ShowPrompt(false) defer c.ShowPrompt(true) c.Print("Query: ") query := c.ReadLine() if d, err = pipeline(query); err != nil { c.Printf("Error: %v", err) } c.Printf("%v\n", d) }, }) shell.AddCmd(&ishell.Cmd{ Name: "show", Help: "show dependency parse on browser", Func: func(c *ishell.Context) { var tmp *os.File if tmp, err = ioutil.TempFile("", "dep"); err != nil { c.Printf("Cannot open file %v\n", err) return } defer os.Remove(tmp.Name()) c.Printf("%v\n", tmp.Name()) dot := d.Tree().Dot() tmp.Write([]byte(dot)) if err := tmp.Close(); err != nil { c.Printf("Error closing file %v", err) } cmd := exec.Command("dot", "-Tpng", "-O", tmp.Name()) if err = cmd.Run(); err != nil { c.Printf("Cannot execute dot: %v\n", err) } browser.OpenFile(tmp.Name() + ".png") }, }) shell.Start() } ================================================ FILE: cmd/demo/nlp.go ================================================ package main import ( "fmt" "strings" "github.com/chewxy/lingo" "github.com/chewxy/lingo/dep" "github.com/chewxy/lingo/lexer" "github.com/chewxy/lingo/pos" "github.com/kljensen/snowball" "github.com/pkg/errors" ) var posModel *pos.Model var depModel *dep.Model var 
clusters map[string]lingo.Cluster type stemmer struct{} func (stemmer) Stem(a string) (string, error) { return snowball.Stem(a, "english", true) } type fixer struct { stemmer } func (f fixer) Clusters() (map[string]lingo.Cluster, error) { return clusters, nil } func (f fixer) Lemmatize(a string, pt lingo.POSTag) ([]string, error) { return nil, nocomp("lemmatizer") } type nocomp string func (e nocomp) Error() string { return fmt.Sprintf("no %v", string(e)) } func (e nocomp) Component() string { return string(e) } func pipeline(s string) (d *lingo.Dependency, err error) { if posModel == nil || depModel == nil { return nil, errors.Errorf("Unable to create a pipeline") } lx := lexer.New(s, strings.NewReader(s)) pt := pos.New(pos.WithModel(posModel), pos.WithStemmer(stemmer{})) dp := dep.New(depModel) // pipeline pt.Input = lx.Output dp.Input = pt.Output go lx.Run() go pt.Run() go dp.Run() var ok bool for { select { case d, ok = <-dp.Output: if !ok { continue } return case err = <-dp.Error: return } } } ================================================ FILE: cmd/dep/fixer.go ================================================ package main import ( "fmt" "github.com/chewxy/lingo" "github.com/kljensen/snowball" ) type stemmer struct{} func (stemmer) Stem(a string) (string, error) { return snowball.Stem(a, "english", true) } type fixer struct { stemmer } func (f fixer) Clusters() (map[string]lingo.Cluster, error) { return clusters, nil } func (f fixer) Lemmatize(a string, pt lingo.POSTag) ([]string, error) { return nil, nocomp("lemmatizer") } type nocomp string func (e nocomp) Error() string { return fmt.Sprintf("no %v", string(e)) } func (e nocomp) Component() string { return string(e) } ================================================ FILE: cmd/dep/io.go ================================================ package main import ( "log" "github.com/chewxy/lingo/dep" "github.com/chewxy/lingo/pos" "github.com/chewxy/lingo/treebank" ) func validateFlags() { if *load == "" && 
*trainFile == "" { log.Fatal("Must either load a model or pass in a training file") } if *epoch < 0 { log.Fatal("epochs must only be positive numbers") } if *load != "" { toLoad = true } if *trainFile != "" { toTrain = true } if *testFile != "" { *cv = true } // warnings if *load == "" && *save == "" { log.Println("WARNING: Models that have been trained will NOT be saved") } } func loadTreebanks() { if *trainFile != "" { trainTB = treebank.LoadUniversal(*trainFile) } if *testFile != "" { testTB = treebank.LoadUniversal(*testFile) } } func loadPOSModel() { var err error if *loadPOS == "" { log.Fatal("Cannot proceed without having a POS model") } if POSModel, err = pos.Load(*loadPOS); err != nil { log.Fatal(err) } } func loadDepModel() { var err error if DepModel, err = dep.Load(*load); err != nil { log.Fatal(err) } } func saveModel() { if *save != "" && DepModel != nil { DepModel.Save(*save) } } ================================================ FILE: cmd/dep/main.go ================================================ package main import ( "flag" "log" "os" "os/signal" "runtime/pprof" "syscall" "github.com/chewxy/lingo" "github.com/chewxy/lingo/dep" "github.com/chewxy/lingo/pos" ) var save = flag.String("save", "", "save as...") var load = flag.String("load", "", "load a model") var loadPOS = flag.String("PTmodel", "", "load a POS Tagger model") var clusterFiles = flag.String("cluster", "", "Brown Cluster files. If nothing is passed in, then the brown cluster won't be used") var trainFile = flag.String("train", "", "Training on... (Only CONLLU formatted training files are accepted)") var testFile = flag.String("test", "", "Test on... (Only CONLLU formatted training files are accepted). If this is not provided, the model will be trained without crossvalidation") var cv = flag.Bool("cv", false, "Cross validate training model? Defaults to false.") var epoch = flag.Int("epoch", 10, "Training epochs. Defaults to 10") var format = flag.String("f", "", "Format to output. 
Default is none. Accepts: {json, dot}") var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file") var memprofile = flag.String("memprofile", "", "write memory profile to this file") var clusters map[string]lingo.Cluster var POSModel *pos.Model var DepModel *dep.Model var toLoad, toTrain bool func init() { if lingo.BUILD_TAGSET != "stanfordtags" && lingo.BUILD_TAGSET != "universaltags" { log.Fatalf("Tagset %q unsupported", lingo.BUILD_TAGSET) } if lingo.BUILD_RELSET != "stanfordrel" && lingo.BUILD_RELSET != "universalrel" { log.Fatalf("Relset %q unsupported", lingo.BUILD_RELSET) } } func cleanup(sigChan chan os.Signal, cpuprofiling, memprofiling bool) { select { case <-sigChan: log.Println("EMERGENCY EXIT") if cpuprofiling { pprof.StopCPUProfile() } if memprofiling { f, err := os.Create(*memprofile) if err != nil { log.Fatal(err) } pprof.WriteHeapProfile(f) f.Close() } saveModel() os.Exit(1) } } func main() { flag.Parse() validateFlags() sigChan := make(chan os.Signal, 1) signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) var cpuprofiling, memprofiling bool if *cpuprofile != "" { f, err := os.Create(*cpuprofile) if err != nil { log.Fatal(err) } cpuprofiling = true pprof.StartCPUProfile(f) defer pprof.StopCPUProfile() } if *memprofile != "" { memprofiling = true } go cleanup(sigChan, cpuprofiling, memprofiling) loadPOSModel() if toLoad { loadDepModel() } if toTrain { loadTreebanks() train() } saveModel() } ================================================ FILE: cmd/dep/pipeline.go ================================================ package main import ( "encoding/json" "fmt" "strings" "github.com/chewxy/lingo" "github.com/chewxy/lingo/dep" "github.com/chewxy/lingo/lexer" "github.com/chewxy/lingo/pos" ) func receive(deps chan *lingo.Dependency, errs, errChan chan error) { defer close(errChan) for { select { case dep, ok := <-deps: if !ok { continue } switch *format { case "json": bs, _ := json.MarshalIndent(dep, "", "\t") fmt.Printf("%s\n", 
string(bs)) case "dot": fmt.Printf("%v\n", dep.Tree().Dot()) } case err := <-errs: errChan <- err } } } func pipeline(s string) error { lx := lexer.New(s, strings.NewReader(s)) pt := pos.New(pos.WithModel(POSModel)) dp := dep.New(DepModel) pt.Input = lx.Output dp.Input = pt.Output errChan := make(chan error) go lx.Run() go pt.Run() go receive(dp.Output, dp.Error, errChan) dp.Run() return <-errChan } ================================================ FILE: cmd/dep/train.go ================================================ package main import ( "log" "github.com/chewxy/lingo/dep" "github.com/chewxy/lingo/treebank" "gorgonia.org/tensor" ) var trainTB []treebank.SentenceTag var testTB []treebank.SentenceTag func train() { conf := dep.DefaultNNConfig conf.Dtype = tensor.Float32 var trainer *dep.Trainer if testTB != nil { log.Printf("TRAINING WITH CROSSVALIDATION") trainer = dep.NewTrainer(dep.WithGeneratedCorpus(trainTB...), dep.WithTrainingSet(trainTB), dep.WithCrossValidationSet(testTB), dep.WithConfig(conf)) trainer.SaveBest = "TMP.model" if err := trainer.Init(); err != nil { log.Fatalf("Unable to initialize trainer: \n%+v", err) } prog := trainer.Perf() cost := trainer.Cost() go func() { for { select { case p := <-prog: log.Printf("%v\n", p) case c := <-cost: log.Printf("Cost %v\n", c) } } }() } else { trainer = dep.NewTrainer(dep.WithGeneratedCorpus(trainTB...), dep.WithTrainingSet(trainTB), dep.WithConfig(conf)) if err := trainer.Init(); err != nil { log.Fatalf("Unable to initialize trainer: \n%+v", err) } prog := trainer.Cost() go func() { for cost := range prog { log.Printf("Cost %v\n", cost) } }() } if err := trainer.Train(*epoch); err != nil { log.Fatal(err) } DepModel = trainer.Model } ================================================ FILE: cmd/lexer/main.go ================================================ package main import ( "flag" "fmt" "strings" "github.com/chewxy/lingo" "github.com/chewxy/lingo/lexer" ) var input = flag.String("input", "", "input string to 
lex") var output = make(chan lingo.Lexeme) func receieve() { for l := range output { fmt.Printf("%v\n", l) } } func main() { flag.Parse() s := *input go receieve() l := lexer.New(s, strings.NewReader(s)) l.Output = output l.Run() } ================================================ FILE: cmd/pos/crossvalidation.go ================================================ package main import ( "bytes" "fmt" "log" "os" "strings" "sync" "github.com/chewxy/lingo" "github.com/chewxy/lingo/lexer" "github.com/chewxy/lingo/pos" "github.com/chewxy/lingo/treebank" ) type testResult struct { tagged lingo.AnnotatedSentence actual lingo.AnnotatedSentence } func (tr testResult) compare() (int, bool) { tagged := tr.tagged actual := tr.actual var sameLength bool = true if len(tagged) != len(actual) { sameLength = false } var counter int for i, v := range actual { if i >= len(tagged) { break } if v.POSTag == tagged[i].POSTag { counter++ } } return counter, sameLength } func crossValidate(resultChan chan testResult) { diffLengthCount := 0 totalLength := 0 correctCount := 0 sentences := 0 var wrongResults []testResult for res := range resultChan { sentences++ length := len(res.actual) cc, sl := res.compare() if !sl { diffLengthCount++ } correctCount += cc totalLength += length if cc != length && *inspect != "" { wrongResults = append(wrongResults, res) } } if *inspect != "" { f, err := os.OpenFile(*inspect, os.O_WRONLY|os.O_CREATE, 0666) if err != nil { log.Fatal(err) } // can write directly to f var buf bytes.Buffer for _, res := range wrongResults { fmt.Fprintf(&buf, "Sentence: \nW:%v\nG:%v\nTags:\nW: %v\nG: %v\n\n", res.actual.StringSlice(), res.tagged.StringSlice(), res.actual.Tags(), res.tagged.Tags()) } f.WriteString(buf.String()) f.Close() } fmt.Printf("CrossValidation: %d/%d = %f. 
Differing Lengths : %d/%d = %f\n", correctCount, totalLength, float64(correctCount)/float64(totalLength), diffLengthCount, sentences, float64(diffLengthCount)/float64(sentences)) } func collect(ch chan lingo.AnnotatedSentence, correct lingo.AnnotatedSentence, outCh chan testResult, wg *sync.WaitGroup) { defer wg.Done() for sentence := range ch { outCh <- testResult{sentence, correct} } } func testModel(sentences []treebank.SentenceTag) { resultChan := make(chan testResult) go func() { defer close(resultChan) var wg sync.WaitGroup for _, sentence := range sentences { wg.Add(1) input := sentence.String() correct := sentence.AnnotatedSentence(fixer{stemmer{}}) ch := make(chan lingo.AnnotatedSentence) go collect(ch, correct, resultChan, &wg) go cvpipeline(input, ch) } wg.Wait() }() crossValidate(resultChan) } func cvpipeline(s string, output chan lingo.AnnotatedSentence) { l := lexer.New(s, strings.NewReader(s)) pt := pos.New(pos.WithModel(model)) pt.Input = l.Output pt.Output = output go l.Run() pt.Run() } ================================================ FILE: cmd/pos/fixer.go ================================================ // +build !chewxy package main import ( "fmt" "github.com/chewxy/lingo" "github.com/kljensen/snowball" ) type stemmer struct{} func (stemmer) Stem(a string) (string, error) { return snowball.Stem(a, "english", true) } type fixer struct { stemmer } func (f fixer) Clusters() (map[string]lingo.Cluster, error) { return clusters, nil } func (f fixer) Lemmatize(a string, pt lingo.POSTag) ([]string, error) { return nil, nocomp("lemmatizer") } type nocomp string func (e nocomp) Error() string { return fmt.Sprintf("no %v", string(e)) } func (e nocomp) Component() string { return string(e) } ================================================ FILE: cmd/pos/main.go ================================================ package main import ( "flag" "fmt" "log" "os" "os/signal" "runtime/pprof" "strings" "sync" "syscall" "time" "github.com/chewxy/lingo" 
"github.com/chewxy/lingo/lexer" "github.com/chewxy/lingo/pos" "github.com/chewxy/lingo/treebank" ) var save = flag.String("save", "", "save as...") var load = flag.String("load", "", "load a model") var clusterFiles = flag.String("cluster", "", "Brown Cluster files. If nothing is passed in, then the brown cluster won't be used") var trainFile = flag.String("train", "", "Training on... files that end with '.conllu' will be treated as CONLLU formatted files. Files ending with '.zip' will be treted as EWT files") var testFile = flag.String("test", "", "Test on... Files to cross validate the model on. If this is provided, automatic crossvalidation will be done") var cv = flag.Bool("cv", false, "Cross validate training model? Defaults to false.") var epoch = flag.Int("epoch", 1500, "Training epochs. Defaults to 1500") var inspect = flag.String("inpect", "", "Inspect all the wrong outputs to figure out what went wrong in the POSTagging. This is useful for debugging") var input = flag.String("input", "", "Input sentence to tag") var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file") var memprofile = flag.String("memprofile", "", "write memory profile to this file") var clusters map[string]lingo.Cluster var model *pos.Model func receive(sentences chan lingo.AnnotatedSentence, wg *sync.WaitGroup) { defer wg.Done() for sent := range sentences { for _, a := range sent { fmt.Printf("%#v: %s| %s | %s | %d\n", a, a.POSTag, a.Lemma, a.WordFlag, a.Cluster) } } } func pipeline(s string) { l := lexer.New(s, strings.NewReader(s)) pt := pos.New(pos.WithModel(model)) pt.Input = l.Output var wg sync.WaitGroup go l.Run() go receive(pt.Output, &wg) wg.Add(1) pt.Run() wg.Wait() } func validateFlags() { if *load == "" && *trainFile == "" { log.Fatal("Must either load a model or pass in a training file") } if *epoch < 0 { log.Fatal("epochs must be positive numbers only!") } if *testFile != "" { *cv = true } // warnings if *load == "" && *save == "" { 
log.Println("WARNING: Models that are trained will NOT be saved") } } func loadOrTrain() { var trained *pos.Tagger if *clusterFiles != "" { f, err := os.Open(*clusterFiles) if err != nil { log.Fatal(err) } clusters = lingo.ReadCluster(f) trained = pos.New(pos.WithCluster(clusters), pos.WithStemmer(stemmer{})) } else { trained = pos.New() } if *load != "" { start := time.Now() var err error if model, err = pos.Load(*load); err != nil { log.Fatal(err) } log.Printf("Loading model from %q took %v", *load, time.Since(start)) return } var sentences []treebank.SentenceTag switch { case strings.HasSuffix(*trainFile, ".zip"): sentences = treebank.LoadEWT(*trainFile) // TODO split sentences for crossvalidation case strings.HasSuffix(*trainFile, ".conllu"): sentences = treebank.LoadUniversal(*trainFile) default: f, err := os.Open(*trainFile) if err != nil { log.Fatal(err) } sentences = treebank.ReadConllu(f) } log.Printf("Start training for %d epochs...", *epoch) start := time.Now() trained.Train(sentences, *epoch) log.Printf("End Training. 
Training took %v minutes", time.Since(start).Minutes()) if *save != "" { trained.Save(*save) log.Printf("Model saved as: %v", *save) } } func cleanup(sigChan chan os.Signal, profiling bool) { select { case <-sigChan: log.Println("EMERGENCY EXIT") if profiling { pprof.StopCPUProfile() } os.Exit(1) } } func main() { flag.Parse() if lingo.BUILD_TAGSET != "stanfordtags" && lingo.BUILD_TAGSET != "universaltags" { log.Fatalf("Tagset: %v is unsupported", lingo.BUILD_TAGSET) } sigChan := make(chan os.Signal, 1) signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) var profiling bool if *cpuprofile != "" { f, err := os.Create(*cpuprofile) if err != nil { log.Fatal(err) } profiling = true pprof.StartCPUProfile(f) defer pprof.StopCPUProfile() } go cleanup(sigChan, profiling) validateFlags() loadOrTrain() if *memprofile != "" { f, err := os.Create(*memprofile) if err != nil { log.Fatal(err) } pprof.WriteHeapProfile(f) f.Close() } if *input != "" { pipeline(*input) } if *cv { log.Printf("Cross Validating now") testSentences := treebank.LoadUniversal(*testFile) testModel(testSentences) } } ================================================ FILE: const.go ================================================ package lingo // constants that are not pertaining to build tags var empty struct{} // NumberWords was generated with this python code /* numberWords = {} simple = '''zero one two three four five six seven eight nine ten eleven twelve thirteen fourteen fifteen sixteen seventeen eighteen nineteen twenty'''.split() for i, word in zip(xrange(0, 20+1), simple): numberWords[word] = i tense = '''thirty forty fifty sixty seventy eighty ninety hundred'''.split() for i, word in zip(xrange(30, 100+1, 10), tense): numberWords[word] = i larges = '''thousand million billion trillion quadrillion quintillion sextillion septillion'''.split() for i, word in zip(xrange(3, 24+1, 3), larges): numberWords[word] = 10**i */ var NumberWords = map[string]int{ "zero": 0, "one": 1, "two": 2, "three": 3, 
"four": 4, "five": 5, "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10, "eleven": 11, "twelve": 12, "thirteen": 13, "fourteen": 14, "fifteen": 15, "sixteen": 16, "nineteen": 19, "seventeen": 17, "eighteen": 18, "twenty": 20, "thirty": 30, "forty": 40, "fifty": 50, "sixty": 60, "seventy": 70, "eighty": 80, "ninety": 90, "hundred": 100, "thousand": 1000, "million": 1000000, "billion": 1000000000, "trillion": 1000000000000, "quadrillion": 1000000000000000, // "quintillion": 1000000000000000000, // "sextillion": 1000000000000000000000, // "septillion": 1000000000000000000000000, } ================================================ FILE: corpus/consopt.go ================================================ package corpus import ( "log" "sort" "sync/atomic" "unicode/utf8" "github.com/pkg/errors" "github.com/xtgo/set" ) // ConsOpt is a construction option for manual creation of a Corpus type ConsOpt func(c *Corpus) error // WithWords creates a corpus from a word list. It may have repeated words func WithWords(a []string) ConsOpt { f := func(c *Corpus) error { s := set.Strings(a) c.words = s c.frequencies = make([]int, len(s)) ids := make(map[string]int) maxID := len(s) var totalFreq, maxWL int // NOTE: here we're iterating over the set of words for i, w := range s { runeCount := utf8.RuneCountInString(w) if runeCount > c.maxWordLength { maxWL = runeCount } ids[w] = i } // NOTE: here we're iterating over the original word list. 
for _, w := range a { c.frequencies[ids[w]]++ totalFreq++ } c.ids = ids atomic.AddInt64(&c.maxid, int64(maxID)) c.totalFreq = totalFreq c.maxWordLength = maxWL return nil } return f } // WithOrderedWords creates a Corpus with the given word order func WithOrderedWords(a []string) ConsOpt { f := func(c *Corpus) error { s := a c.words = s c.frequencies = make([]int, len(s)) for i := range c.frequencies { c.frequencies[i] = 1 } ids := make(map[string]int) maxID := len(s) totalFreq := len(s) var maxWL int for i, w := range a { runeCount := utf8.RuneCountInString(w) if runeCount > c.maxWordLength { maxWL = runeCount } ids[w] = i } c.ids = ids atomic.AddInt64(&c.maxid, int64(maxID)) c.totalFreq = totalFreq c.maxWordLength = maxWL return nil } return f } // WithSize preallocates all the things in Corpus func WithSize(size int) ConsOpt { return func(c *Corpus) error { c.words = make([]string, 0, size) c.frequencies = make([]int, 0, size) return nil } } // FromDict is a construction option to take a map[string]int where the int represents the word ID. // This is useful for constructing corpuses from foreign sources where the ID mappings are important func FromDict(d map[string]int) ConsOpt { return func(c *Corpus) error { var a sortutil for k, v := range d { a.words = append(a.words, k) a.ids = append(a.ids, v) } sort.Sort(&a) c.ids = make(map[string]int) for i, w := range a.words { if i != a.ids[i] { return errors.Errorf("Unmarshaling error. Expected %dth ID to be %d. Got %d instead. Perhaps something went wrong during sorting? SLYTHERIN IT IS!", i, i, a.ids[i]) } c.words = append(c.words, w) c.frequencies = append(c.frequencies, 1) c.ids[w] = i c.totalFreq++ runeCount := utf8.RuneCountInString(w) if runeCount > c.maxWordLength { log.Printf("FD MaxWordLength %d - %q", runeCount, w) c.maxWordLength = runeCount } } c.maxid = int64(len(a.words)) return nil } } // FromDictWithFreq is like FromDict, but also has a frequency. 
func FromDictWithFreq(d map[string]struct{ ID, Freq int }) ConsOpt { return func(c *Corpus) error { var a sortutil for k, v := range d { a.words = append(a.words, k) a.ids = append(a.ids, v.ID) a.freqs = append(a.freqs, v.Freq) } sort.Sort(&a) c.ids = make(map[string]int) for i, w := range a.words { if i != a.ids[i] { return errors.Errorf("Unmarshaling error. Expected %dth ID to be %d. Got %d instead. Perhaps something went wrong during sorting? SLYTHERIN IT IS!", i, i, a.ids[i]) } c.words = append(c.words, w) c.frequencies = append(c.frequencies, a.freqs[i]) c.ids[w] = i c.totalFreq += a.freqs[i] runeCount := utf8.RuneCountInString(w) if runeCount > c.maxWordLength { c.maxWordLength = runeCount } } c.maxid = int64(len(a.words)) return nil } } ================================================ FILE: corpus/corpus.go ================================================ package corpus import ( "sync/atomic" "unicode/utf8" "github.com/pkg/errors" ) // Corpus is a data structure holding the relevant metadata and information for a corpus of text. // It serves as vocabulary with ID for lookup. This is very useful as neural networks rely on the IDs rather than the text themselves type Corpus struct { words []string frequencies []int ids map[string]int // atomic read and write plz maxid int64 totalFreq int maxWordLength int } // New creates a new *Corpus func New() *Corpus { c := &Corpus{ words: make([]string, 0), frequencies: make([]int, 0), ids: make(map[string]int), } // add some default words c.Add("") // aka NULL - when there are no words c.Add("-UNKNOWN-") c.Add("-ROOT-") c.maxWordLength = 0 // specials don't have lengths return c } // Construct creates a Corpus given the construction options. 
This allows for more flexibility func Construct(opts ...ConsOpt) (*Corpus, error) { c := new(Corpus) // checks if c.words == nil { c.words = make([]string, 0) } if c.frequencies == nil { c.frequencies = make([]int, 0) } if c.ids == nil { c.ids = make(map[string]int) } for _, opt := range opts { if err := opt(c); err != nil { return nil, err } } return c, nil } // ID returns the ID of a word and whether or not it was found in the corpus func (c *Corpus) Id(word string) (int, bool) { id, ok := c.ids[word] return id, ok } // Word returns the word given the ID, and whether or not it was found in the corpus func (c *Corpus) Word(id int) (string, bool) { size := atomic.LoadInt64(&c.maxid) maxid := int(size) if id >= maxid { return "", false } return c.words[id], true } // Add adds a word to the corpus and returns its ID. If a word was previously in the corpus, it merely updates the frequency count and returns the ID func (c *Corpus) Add(word string) int { if id, ok := c.ids[word]; ok { c.frequencies[id]++ c.totalFreq++ return id } id := atomic.AddInt64(&c.maxid, 1) c.ids[word] = int(id - 1) c.words = append(c.words, word) c.frequencies = append(c.frequencies, 1) c.totalFreq++ runeCount := utf8.RuneCountInString(word) if runeCount > c.maxWordLength { c.maxWordLength = runeCount } return int(id - 1) } // Size returns the size of the corpus. func (c *Corpus) Size() int { size := atomic.LoadInt64(&c.maxid) return int(size) } // WordFreq returns the frequency of the word. If the word wasn't in the corpus, it returns 0. func (c *Corpus) WordFreq(word string) int { id, ok := c.ids[word] if !ok { return 0 } return c.frequencies[id] } // IDFreq returns the frequency of a word given an ID. If the word isn't in the corpus it returns 0. func (c *Corpus) IDFreq(id int) int { size := atomic.LoadInt64(&c.maxid) maxid := int(size) if id >= maxid { return 0 } return c.frequencies[id] } // TotalFreq returns the total number of words ever seen by the corpus. 
This number includes the count of repeat words. func (c *Corpus) TotalFreq() int { return c.totalFreq } // MaxWordLength returns the length of the longest known word in the corpus. func (c *Corpus) MaxWordLength() int { return c.maxWordLength } // WordProb returns the probability of a word appearing in the corpus. func (c *Corpus) WordProb(word string) (float64, bool) { id, ok := c.Id(word) if !ok { return 0, false } count := c.frequencies[id] return float64(count) / float64(c.totalFreq), true } // Merge combines two corpuses. The receiver is the one that is mutated. func (c *Corpus) Merge(other *Corpus) { for i, word := range other.words { freq := other.frequencies[i] if id, ok := c.ids[word]; ok { c.frequencies[id] += freq c.totalFreq += freq } else { id := c.Add(word) c.frequencies[id] += freq - 1 c.totalFreq += freq - 1 } } } // Replace replaces the content of a word. The old reference remains. // // e.g: c.Replace("foo", "bar") // c.Id("foo") will still return a ID. The ID will be the same as c.Id("bar") func (c *Corpus) Replace(a, with string) error { old, ok := c.ids[a] if !ok { return errors.Errorf("Cannot replace %q with %q. %q is not found", a, with, a) } if _, ok := c.ids[with]; ok { return errors.Errorf("Cannot replace %q with %q. %q exists in the corpus", a, with, with) } c.words[old] = with return nil } // ReplaceWord replaces the word associated with the given ID. The old reference remains. func (c *Corpus) ReplaceWord(id int, with string) error { if id >= len(c.words) { return errors.Errorf("Cannot replace word with ID %d. Out of bounds.", id) } if _, ok := c.ids[with]; ok { return errors.Errorf("Cannot replace word with ID %d with %q. 
%q exists in the corpus", id, with, with) } c.words[id] = with return nil } ================================================ FILE: corpus/corpus_test.go ================================================ package corpus import ( "testing" "github.com/stretchr/testify/assert" ) func TestCorpus(t *testing.T) { assert := assert.New(t) dict := New() assert.Equal(0, dict.WordFreq("hello")) // frequency of a word not in dict ould have to be 0 assert.Equal(0, dict.IDFreq(3)) // ditto id := dict.Add("hello") assert.Equal(3, id) assert.Equal([]string{"", "-UNKNOWN-", "-ROOT-", "hello"}, dict.words) assert.Equal(map[string]int{"": 0, "-UNKNOWN-": 1, "-ROOT-": 2, "hello": 3}, dict.ids) assert.Equal(4, dict.Size()) id2, ok := dict.Id("hello") if !ok { t.Errorf("The ID of null should be 0") } assert.Equal(id, id2) word, ok := dict.Word(3) if !ok { t.Errorf("Expected word of ID 3 to be found") } assert.Equal("hello", word) dict.Add(word) assert.Equal(2, dict.WordFreq(word)) assert.Equal(2, dict.IDFreq(3)) assert.Equal(5, dict.TotalFreq()) assert.Equal(5, dict.MaxWordLength()) prob, ok := dict.WordProb(word) if !ok { t.Errorf("Expected a probability") } assert.Equal(0.4, prob) // t.Logf("%q: %v", word, dict.WordProb(word)) } func TestCorpus_Merge(t *testing.T) { assert := assert.New(t) dict := New() id := dict.Add("hello") dict.frequencies[id] += 4 // freq for "hello" is 5 dict.totalFreq += 4 other := New() id = other.Add("hello") other.frequencies[id] += 2 // freq for "hello" is 3 other.totalFreq += 2 id = other.Add("world") other.frequencies[id] += 1 other.totalFreq += 1 dict.Merge(other) assert.Equal(8, dict.WordFreq("hello")) assert.Equal(2, dict.WordFreq("world")) } ================================================ FILE: corpus/functions.go ================================================ package corpus import ( "math" "strings" "unicode/utf8" "github.com/chewxy/lingo" "github.com/chewxy/lingo/treebank" "github.com/pkg/errors" ) // GenerateCorpus creates a Corpus given a set of 
// ViterbiSplit is a Viterbi algorithm for splitting words given a corpus.
//
// The input is lower-cased and cut into the sequence of substrings with the
// best combined score, where a substring known to c scores c.WordProb and an
// unknown one gets a length-scaled penalty. The pieces are returned in their
// original order.
//
// NOTE(review): offsets below are byte offsets while the backtracking starts
// from a rune count; the two only agree when every rune of the input is a
// single byte. Confirm whether non-ASCII input is expected.
func ViterbiSplit(input string, c *Corpus) []string {
	s := strings.ToLower(input)
	// probabilities[k] is the best score found for the first k positions
	// visited; lasts[k] is the start offset of the last piece of that split.
	probabilities := []float64{1.0}
	lasts := []int{0}
	// runes holds the byte offset at which each rune of s starts, followed by
	// a sentinel that is larger than any offset so the inner loop always
	// terminates via its break.
	runes := []int{}
	for i := range s {
		runes = append(runes, i)
	}
	runes = append(runes, len(s)+1)
	for i := range s {
		// Candidate scores for a last piece that ends at byte i, and the
		// start offset of each candidate.
		probs := make([]float64, 0)
		ls := make([]int, 0)
		// m := maxInt(0, i-c.maxWordLength)
		for j, r := range runes {
			if r > i {
				break
			}
			// s[r : i+1] is sliced by byte; if the rune at i is wider than
			// one byte the slice ends inside it.
			p, ok := c.WordProb(s[r : i+1])
			if !ok {
				// http://stackoverflow.com/questions/195010/how-can-i-split-multiple-joined-words#comment48879458_481773
				// The penalty is multiplied by (i-r), so it is 0 when r == i.
				p = (math.Log(float64(1)/float64(c.totalFreq)) - float64(c.maxWordLength) - float64(1)) * float64(i-r) // note it should be i-r not j-i as per the SO post
			}
			prob := probabilities[j] * p
			probs = append(probs, prob)
			ls = append(ls, r)
		}
		// NOTE(review): math.SmallestNonzeroFloat64 is the smallest positive
		// float, so this starting value is just below zero rather than the
		// most negative float; -math.MaxFloat64 may have been intended.
		maxProb := -math.SmallestNonzeroFloat64
		// NOTE(review): this constant only fits a 64-bit int. It is kept when
		// no candidate beats maxProb, and would then reach the slicing below
		// as a start offset - confirm that cannot happen.
		maxK := -1 << 63
		for j, p := range probs {
			if p > maxProb {
				maxProb = p
				maxK = ls[j]
			}
		}
		probabilities = append(probabilities, maxProb)
		lasts = append(lasts, maxK)
	}
	// Walk the recorded start offsets backwards from the end of the input,
	// collecting the pieces last-to-first.
	words := make([]string, 0)
	i := utf8.RuneCountInString(s)
	for i > 0 {
		start := lasts[i]
		words = append(words, s[start:i])
		i = start
	}
	// reverse it
	for i, j := 0, len(words)-1; i < j; i, j = i+1, j-1 {
		words[i], words[j] = words[j], words[i]
	}
	return words
}
// DamerauLevenshtein calculates the Damerau-Levenshtein distance between two
// strings, comparing them rune by rune.
// See more at https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
//
// If either string is empty the result is the rune count of the other one.
// minInt and maxInt are helpers defined elsewhere in this package.
func DamerauLevenshtein(s1 string, s2 string) (distance int) {
	// index by code point, not byte
	r1 := []rune(s1)
	r2 := []rune(s2)

	// the maximum possible distance; used as the transposition cost when no
	// transposition candidate exists
	inf := len(r1) + len(r2)

	// if one string is blank, we need insertions
	// for all characters in the other one
	if len(r1) == 0 {
		return len(r2)
	}
	if len(r2) == 0 {
		return len(r1)
	}

	// construct the edit-tracking matrix: one row per rune of r1, one column
	// per rune of r2. The value returned at the end is the bottom-right cell.
	matrix := make([][]int, len(r1))
	for i := range matrix {
		matrix[i] = make([]int, len(r2))
	}

	// seen characters: rune of r1 -> the latest row in which it occurred
	seenRunes := make(map[rune]int)
	if r1[0] != r2[0] {
		matrix[0][0] = 1
	}
	seenRunes[r1[0]] = 0
	// first column: every prefix of r1 against the first rune of r2
	for i := 1; i < len(r1); i++ {
		deleteDist := matrix[i-1][0] + 1
		insertDist := (i+1)*1 + 1
		var matchDist int
		if r1[i] == r2[0] {
			matchDist = i
		} else {
			matchDist = i + 1
		}
		matrix[i][0] = minInt(minInt(deleteDist, insertDist), matchDist)
	}
	// first row: the first rune of r1 against every prefix of r2
	for j := 1; j < len(r2); j++ {
		// NOTE(review): this border term is (j + 1) * 2 while its counterpart
		// in the loop above is (i+1)*1 + 1 - confirm the asymmetry is intended.
		deleteDist := (j + 1) * 2
		insertDist := matrix[0][j-1] + 1
		var matchDist int
		if r1[0] == r2[j] {
			matchDist = j
		} else {
			matchDist = j + 1
		}
		matrix[0][j] = minInt(minInt(deleteDist, insertDist), matchDist)
	}
	for i := 1; i < len(r1); i++ {
		// maxSrcMatchIndex is the latest column in this row at which r1[i]
		// matched a rune of r2, or -1 if it has not matched yet.
		var maxSrcMatchIndex int
		if r1[i] == r2[0] {
			maxSrcMatchIndex = 0
		} else {
			maxSrcMatchIndex = -1
		}
		for j := 1; j < len(r2); j++ {
			swapIndex, ok := seenRunes[r2[j]]
			jSwap := maxSrcMatchIndex
			deleteDist := matrix[i-1][j] + 1
			insertDist := matrix[i][j-1] + 1
			matchDist := matrix[i-1][j-1]
			if r1[i] != r2[j] {
				matchDist += 1
			} else {
				maxSrcMatchIndex = j
			}
			// for transpositions: only possible when r2[j] was seen in an
			// earlier row of r1 and r1[i] already matched in this row
			var swapDist int
			if ok && jSwap != -1 {
				iSwap := swapIndex
				var preSwapCost int
				if iSwap == 0 && jSwap == 0 {
					preSwapCost = 0
				} else {
					preSwapCost = matrix[maxInt(0, iSwap-1)][maxInt(0, jSwap-1)]
				}
				swapDist = i + j + preSwapCost - iSwap - jSwap - 1
			} else {
				swapDist = inf
			}
			matrix[i][j] = minInt(minInt(minInt(deleteDist, insertDist), matchDist), swapDist)
		}
		// recorded only after the row is complete, so lookups made in row i
		// see earlier rows only
		seenRunes[r1[i]] = i
	}
	return matrix[len(r1)-1][len(r2)-1]
}
// LongestCommonPrefix takes a slice of strings, and finds the longest common
// prefix.
//
// It returns "" when called without arguments and the string itself when
// called with exactly one. The result never ends in the middle of a UTF-8
// encoded rune: if the inputs diverge inside a multi-byte rune, the prefix
// stops before that rune.
func LongestCommonPrefix(strs ...string) string {
	switch len(strs) {
	case 0:
		return ""
	case 1:
		return strs[0]
	}

	// The common prefix of all strings equals the common prefix of the
	// lexicographically smallest and largest ones.
	lo, hi := strs[0], strs[0]
	for _, s := range strs[1:] {
		switch {
		case s < lo:
			lo = s
		case s > hi:
			hi = s
		}
	}

	for i := 0; i < len(lo) && i < len(hi); i++ {
		if lo[i] == hi[i] {
			continue
		}
		// The strings differ at byte i. If that byte is a continuation byte,
		// back up to the start of the rune it belongs to so the result stays
		// valid UTF-8.
		for i > 0 && !utf8.RuneStart(lo[i]) {
			i--
		}
		return lo[:i]
	}

	// In the case where lengths are not equal but all bytes
	// are equal, lo is the answer ("foo" < "foobar").
	return lo
}
//		one hundred and fifty thousand two hundred and two
// there are 2 repeated patterns (one hundred and fifty) and (two hundred and two)
//
// This allows us to repeatedly combine by addition or multiplication until there is one left
//
// The slice is consumed from the front in (value, multiplier) pairs, where
// the multiplier must be at least 1000. A single remaining element is added
// as is. The function panics if a pair's second element is below 1000. An
// empty slice yields 0.
func CombineInts(ints []int) int {
	var total int
	for len(ints) > 0 {
		if len(ints) == 1 || ints[0] >= 1000 {
			// either the final group, or a leading bare multiplier
			// ("thousand five"): add the last element and drop it
			last := ints[len(ints)-1]
			total += last
			ints = ints[0 : len(ints)-1] //pop it
		} else {
			if ints[1] < 1000 {
				// something went wrong
				panic("HELP!")
			}

			total += ints[0] * ints[1]
			ints = ints[2:]
		}
	}
	return total
}

================================================
FILE: corpus/functions_test.go
================================================
package corpus

import (
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

// Test_GenerateCorpus checks the words, IDs and maxid of a corpus generated
// from mediumSentence().
func Test_GenerateCorpus(t *testing.T) {
	sentenceTags := mediumSentence()
	dict := GenerateCorpus(sentenceTags)

	// testing time
	assert := assert.New(t)
	expectedWords := []string{"", "-UNKNOWN-", "-ROOT-", "President", "Bush", "on", "Tuesday", "nominated", "two", "individuals", "to", "replace", "retiring", "jurists", "federal", "courts", "in", "the", "Washington", "area", "."}
	expectedIDs := make(map[string]int)

	for i, w := range expectedWords {
		expectedIDs[w] = i
	}

	assert.Equal(expectedWords, dict.words, "Corpus known words should be the same as the manually annotated expected values")
	assert.Equal(expectedIDs, dict.ids, "IDs should be the same as expected IDs")
	assert.Equal(int64(len(expectedWords)), dict.maxid)
}

// TestViterbiSplit checks that concatenated words are split into the
// lower-cased corpus words they are made of.
func TestViterbiSplit(t *testing.T) {
	assert := assert.New(t)
	dict := GenerateCorpus(mediumSentence())

	s2 := "twoindividuals"
	words := ViterbiSplit(s2, dict)
	assert.Equal([]string{"two", "individuals"}, words)

	s2 = "FederalCourts"
	words = ViterbiSplit(s2, dict)
	assert.Equal([]string{"federal", "courts"}, words)

	s3 := "toreplaceon"
	words = ViterbiSplit(s3, dict)
	assert.Equal([]string{"to", "replace", "on"}, words)
}

// TestCosineSimilarity checks that a word list is maximally similar to itself.
func TestCosineSimilarity(t *testing.T) {
	a := strings.Split("This is a test of cosine
similarity", " ")
	b := strings.Split("This is not a test of cosine similarity", " ")

	s1 := CosineSimilarity(a, a)
	s2 := CosineSimilarity(a, b)

	// a slice compared with itself must score 1, and nothing may score higher
	if !floatEquals64(s1, 1) {
		t.Error("Expected similarity to be 1 when compared with itself")
	}

	if s2 > s1 {
		t.Error("Something went wrong with the cosine similarity algorithm")
	}

	c := strings.Split("Parramatta Road", " ")
	d := strings.Split("Parramatta Rd", " ")

	s1 = CosineSimilarity(c, c)
	s2 = CosineSimilarity(c, d)

	if !floatEquals64(s1, 1) {
		t.Error("Expected similarity to be 1 when compared with itself")
	}

	if s2 > s1 {
		t.Error("Something went wrong with the cosine similarity algorithm")
	}
}

// TestDL checks that the Damerau-Levenshtein distance of a string to itself
// is 0, and that the distance to a different string is not smaller than that.
func TestDL(t *testing.T) {
	a := "This is a test of Damerau Levenshtein"
	b := "This is not a test of Damerau Levenshtein"

	s1 := DamerauLevenshtein(a, a)
	s2 := DamerauLevenshtein(a, b)

	if s1 != 0 {
		t.Errorf("Expected the distance to be 0 when compared against itself. Got %d", s1)
	}

	if s2 < s1 {
		t.Error("Expected DL similarity to be greater when compared against itself")
	}

	c := "Parramatta Road"
	d := "Paramatta Rd"

	s1 = DamerauLevenshtein(c, c)
	s2 = DamerauLevenshtein(c, d)

	if s1 != 0 {
		t.Errorf("Expected the distance to be 0 when compared against itself.
Got %d", s1)
	}

	if s2 < s1 {
		t.Error("Expected DL similarity to be greater when compared against itself")
	}
}

// TestLCP covers the common case, no shared prefix, no input, a single input
// and one input being a prefix of the other.
func TestLCP(t *testing.T) {
	assert := assert.New(t)

	lcp := LongestCommonPrefix("Hello World", "Hell yeah!")
	assert.Equal("Hell", lcp)

	lcp = LongestCommonPrefix("Hello World", "Hell yeah!", "hey there")
	assert.Equal("", lcp)

	lcp = LongestCommonPrefix()
	assert.Equal("", lcp)

	lcp = LongestCommonPrefix("OneWord")
	assert.Equal("OneWord", lcp)

	lcp = LongestCommonPrefix("foo", "foobar")
	assert.Equal("foo", lcp)
}

// parseNumTests pairs a number written in words with its expected value.
var parseNumTests = []struct {
	s string
	v int
}{
	{"twenty nine", 29},
	{"one hundred five", 105},
	{"five hundred twenty thousand twenty one", 520021},
}

// TestParseNumber runs StrsToInts followed by CombineInts over parseNumTests.
func TestParseNumber(t *testing.T) {
	for _, pnts := range parseNumTests {
		s := strings.Split(pnts.s, " ")
		ints, err := StrsToInts(s)
		if err != nil {
			t.Error(err)
			continue
		}

		v := CombineInts(ints)
		if v != pnts.v {
			t.Errorf("Expected %q to be parsed to %d. Got %d instead", pnts.s, pnts.v, v)
		}
	}
}

================================================
FILE: corpus/inflection.go
================================================
package corpus

import (
	"regexp"

	"github.com/chewxy/lingo"
)

// conversionPattern is one rewrite rule: a compiled pattern and the
// replacement text (which may reference capture groups as ${n}).
type conversionPattern struct {
	pattern     *regexp.Regexp
	replacement string
}

// newConversionPattern compiles from into a rule that rewrites to to.
// It panics if from is not a valid regular expression.
func newConversionPattern(from, to string) conversionPattern {
	rFrom := regexp.MustCompile(from)
	return conversionPattern{rFrom, to}
}

// singular -> plural
//
// plural holds the rules used by Pluralize. They are tried in order and the
// first matching rule wins, so the catch-all rules must stay last.
var plural = []conversionPattern{
	newConversionPattern("(quiz)$", "${1}zes"),
	newConversionPattern("^(ox)$", "${1}en"),
	// NOTE(review): inside a character class '|' is a literal, so [m|l] also matches "|ouse".
	newConversionPattern("([m|l])ouse$", "${1}ice"),
	// NOTE(review): the alternation splits this into "(matr|vert|ind)ix" (unanchored) or "ex$".
	newConversionPattern("(matr|vert|ind)ix|ex$", "${1}ices"),
	newConversionPattern("(x|ch|ss|sh)$", "${1}es"),
	// NOTE(review): this rule maps "...ies" to "...y", which is a singularization; confirm it is intended here.
	newConversionPattern("([^aeiouy]|qu)ies$", "${1}y"),
	newConversionPattern("([^aeiouy]|qu)y$", "${1}ies"),
	newConversionPattern("(hive)$", "${1}s"),
	newConversionPattern("(?:([^f])fe|([lr])f)$", "${1}${2}ves"),
	newConversionPattern("sis$", "ses"),
	newConversionPattern("([ti])um$", "${1}a"),
	newConversionPattern("(buffal|tomat|potat)o$", "${1}oes"),
	newConversionPattern("(bu)s$", "${1}ses"),
	newConversionPattern("(alias|status|sex)$", "${1}es"),
	newConversionPattern("(octop|vir)us$", "${1}i"),
	newConversionPattern("(ax|test)is$", "${1}es"),
	// catch-alls: a word already ending in "s" is left alone, anything else gets an "s"
	newConversionPattern("s$", "s"),
	newConversionPattern("$", "s"),
}

// plural -> singular
//
// singular holds the rules used by Singularize. They are tried in order and
// the first matching rule wins, so the catch-all "s$" rule must stay last.
var singular = []conversionPattern{
	newConversionPattern("(quiz)zes$", "${1}"),
	newConversionPattern("(matr)ices$", "${1}ix"),
	newConversionPattern("(vert|ind)ices$", "${1}ex"),
	newConversionPattern("^(ox)en", "${1}"),
	newConversionPattern("(alias|status)es$", "${1}"),
	newConversionPattern("(octop|vir)i$", "${1}us"),
	newConversionPattern("(cris|ax|test)es$", "${1}is"),
	newConversionPattern("(shoe)s$", "${1}"),
	newConversionPattern("(o)es$", "${1}"),
	newConversionPattern("(bus)es$", "${1}"),
	// NOTE(review): inside a character class '|' is a literal, so [m|l] also matches "|ice".
	newConversionPattern("([m|l])ice$", "${1}ouse"),
	newConversionPattern("(x|ch|ss|sh)es$", "${1}"),
	newConversionPattern("(m)ovies$", "${1}ovie"),
	newConversionPattern("(s)eries$", "${1}eries"),
	newConversionPattern("([^aeiouy]|qu)ies$", "${1}y"),
	newConversionPattern("([lr])ves$", "${1}f"),
	newConversionPattern("(tive)s$", "${1}"),
	newConversionPattern("(hive)s$", "${1}"),
	newConversionPattern("([^f])ves$", "${1}fe"),
	newConversionPattern("(^analy)ses$", "${1}sis"),
	newConversionPattern("((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$", "${1}${2}sis"),
	newConversionPattern("([ti])a$", "${1}um"),
	newConversionPattern("(n)ews$", "${1}ews"),
	newConversionPattern("s$", ""),
}

// weird pluralizations that don't match the rules above
//
// Each entry pairs a singular word (stored as the pattern) with its plural
// (stored as the replacement).
var irregular = []conversionPattern{
	newConversionPattern("person", "people"),
	newConversionPattern("man", "men"),
	newConversionPattern("child", "children"),
	newConversionPattern("sex", "sexes"),
	newConversionPattern("move", "moves"),
	newConversionPattern("sleeve", "sleeves"),
	newConversionPattern("datum", "data"),
	newConversionPattern("box", "boxes"),
	newConversionPattern("knife", "knives"),
}

var
unconvertable = []string{ "equipment", "information", "rice", "money", "species", "series", "fish", "sheep", } // Pluralize pluralizes words based on rules known func Pluralize(word string) string { if lingo.InStringSlice(word, unconvertable) { return word } for _, cp := range irregular { if cp.pattern.MatchString(word) { return cp.replacement } } for _, cp := range plural { if cp.pattern.MatchString(word) { // log.Printf("\t%q Matches %q", word, cp.pattern.String()) return cp.pattern.ReplaceAllString(word, cp.replacement) } } return word } // Singularize singularizes words based on rules known func Singularize(word string) string { if lingo.InStringSlice(word, unconvertable) { return word } for _, cp := range singular { if cp.pattern.MatchString(word) { return cp.pattern.ReplaceAllString(word, cp.replacement) } } return word } ================================================ FILE: corpus/inflection_test.go ================================================ package corpus import "testing" var pluralizeTest = []struct { word, correct string }{ {"friend", "friends"}, {"tomato", "tomatoes"}, {"knife", "knives"}, {"dwarf", "dwarves"}, {"box", "boxes"}, {"ox", "oxen"}, {"man", "men"}, {"equipment", "equipment"}, } var singularizeTest = []struct { word, correct string }{ {"condolences", "condolence"}, {"fish", "fish"}, {"shoes", "shoe"}, {"viri", "virus"}, {"elves", "elf"}, } func TestPluralize(t *testing.T) { for _, pts := range pluralizeTest { got := Pluralize(pts.word) if got != pts.correct { t.Errorf("Pluralizing %q failed. Want %q. Got %q instead", pts.word, pts.correct, got) } } } func TestSingularize(t *testing.T) { for _, pts := range singularizeTest { got := Singularize(pts.word) if got != pts.correct { t.Errorf("Singularizing %q failed. Want %q. 
Got %q instead", pts.word, pts.correct, got)
		}
	}
}

================================================
FILE: corpus/io.go
================================================
package corpus

import (
	"bufio"
	"bytes"
	"encoding/gob"
	"io"
	"strconv"
	"strings"
)

// sortutil is a utility struct meant to sort words based on IDs
//
// words, ids and (when present) freqs are parallel slices: element i of each
// belongs to the same word.
type sortutil struct {
	words []string
	ids   []int
	freqs []int
}

// Len returns the number of words.
func (s *sortutil) Len() int { return len(s.words) }

// Less orders elements by ascending ID.
func (s *sortutil) Less(i, j int) bool { return s.ids[i] < s.ids[j] }

// Swap exchanges elements i and j in every parallel slice. freqs is optional
// and is only swapped when it is non-empty.
func (s *sortutil) Swap(i, j int) {
	s.words[i], s.words[j] = s.words[j], s.words[i]
	s.ids[i], s.ids[j] = s.ids[j], s.ids[i]

	if len(s.freqs) > 0 {
		s.freqs[i], s.freqs[j] = s.freqs[j], s.freqs[i]
	}
}

// ToDictWithFreq returns a simple marshalable type. Conceptually it's a JSON object with the words as the keys. The values are a pair - ID and Freq.
//
// The ID of a word is its index in c.words, and Freq is the entry of
// c.frequencies at that index.
func ToDictWithFreq(c *Corpus) map[string]struct{ ID, Freq int } {
	retVal := make(map[string]struct{ ID, Freq int })
	for i, w := range c.words {
		retVal[w] = struct{ ID, Freq int }{i, c.frequencies[i]}
	}
	return retVal
}

// ToDict returns a marshalable dict. It returns a copy of the ID mapping.
func ToDict(c *Corpus) map[string]int { retVal := make(map[string]int) for k, v := range c.ids { retVal[k] = v } return retVal } // GobEncode implements GobEncoder for *Corpus func (c *Corpus) GobEncode() ([]byte, error) { var buf bytes.Buffer encoder := gob.NewEncoder(&buf) if err := encoder.Encode(c.words); err != nil { return nil, err } if err := encoder.Encode(c.ids); err != nil { return nil, err } if err := encoder.Encode(c.frequencies); err != nil { return nil, err } if err := encoder.Encode(c.maxid); err != nil { return nil, err } if err := encoder.Encode(c.totalFreq); err != nil { return nil, err } if err := encoder.Encode(c.maxWordLength); err != nil { return nil, err } return buf.Bytes(), nil } // GobDecode implements GobDecoder for *Corpus func (c *Corpus) GobDecode(buf []byte) error { b := bytes.NewBuffer(buf) decoder := gob.NewDecoder(b) if err := decoder.Decode(&c.words); err != nil { return err } if err := decoder.Decode(&c.ids); err != nil { return err } if err := decoder.Decode(&c.frequencies); err != nil { return err } if err := decoder.Decode(&c.maxid); err != nil { return err } if err := decoder.Decode(&c.totalFreq); err != nil { return err } if err := decoder.Decode(&c.maxWordLength); err != nil { return err } return nil } // LoadOneGram loads a 1_gram.txt file, which is a tab separated file which lists the frequency counts of words. 
Example: // the 23135851162 // of 13151942776 // and 12997637966 // to 12136980858 // a 9081174698 // in 8469404971 // for 5933321709 func (c *Corpus) LoadOneGram(r io.Reader) error { scanner := bufio.NewScanner(r) for scanner.Scan() { line := scanner.Text() splits := strings.Split(line, "\t") if len(splits) == 0 { break } word := splits[0] // TODO: normalize count, err := strconv.Atoi(splits[1]) if err != nil { return err } id := c.Add(word) c.frequencies[id] = count c.totalFreq-- c.totalFreq += count wc := len([]rune(word)) if wc > c.maxWordLength { c.maxWordLength = wc } } return nil } ================================================ FILE: corpus/io_test.go ================================================ package corpus import ( "bytes" "encoding/gob" "strings" "testing" "github.com/stretchr/testify/assert" ) func TestCorpusGob(t *testing.T) { buf := new(bytes.Buffer) c := New() c.Add("Hello") c.Add("World") helloID, _ := c.Id("Hello") worldID, _ := c.Id("World") encoder := gob.NewEncoder(buf) decoder := gob.NewDecoder(buf) if err := encoder.Encode(c); err != nil { t.Fatal(err) } c2 := New() if err := decoder.Decode(c2); err != nil { t.Fatal(err) } if hid, ok := c2.Id("Hello"); !ok || (ok && hid != helloID) { t.Errorf("\"Hello\" not found after decoding.") } if wid, ok := c2.Id("World"); !ok || (ok && wid != worldID) { t.Errorf("\"World\" not found after decoding.") } } func TestCorpusToDict(t *testing.T) { assert := assert.New(t) c, _ := Construct(WithWords([]string{"World", "Hello", "World"})) d := ToDict(c) c2, err := Construct(FromDict(d)) if err != nil { t.Fatal(err) } assert.Equal(c.words, c2.words, "Expected words to be the same") assert.Equal(c.ids, c2.ids, "Expected IDs to be the same") assert.NotEqual(c.frequencies, c2.frequencies, "Expected frequencies to not be the same") assert.Equal(c.maxid, c2.maxid, "Expected maxID to be the same") assert.NotEqual(c.totalFreq, c2.totalFreq, "Expected totalFreq to be different.") assert.Equal(c.maxWordLength, 
c2.maxWordLength, "Expected maxWordLength to be the same")
}

// TestCorpusToDictWithFreq rebuilds a corpus from ToDictWithFreq output and
// expects it to equal the original.
func TestCorpusToDictWithFreq(t *testing.T) {
	assert := assert.New(t)
	c, _ := Construct(WithWords([]string{"World", "Hello", "World"}))
	d := ToDictWithFreq(c)
	c2, err := Construct(FromDictWithFreq(d))
	if err != nil {
		t.Fatal(err)
	}

	assert.Equal(c, c2)
}

// TestLoadOneGram loads sample1Gram into a fresh corpus and checks its size
// and that the last word, "for", received the last ID.
func TestLoadOneGram(t *testing.T) {
	assert := assert.New(t)
	r := strings.NewReader(sample1Gram)
	c := New()
	err := c.LoadOneGram(r)
	assert.Nil(err)
	assert.Equal(10, c.Size())

	id, ok := c.Id("for")
	if !ok {
		t.Errorf("Expected \"for\" to be in corpus after loading one gram file")
	}
	assert.Equal(int(c.maxid-1), id)
}

================================================
FILE: corpus/lda.go
================================================
package corpus

import (
	"gorgonia.org/tensor"
)

// LDAModel ... TODO
//https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation
type LDAModel struct {
	// params
	Alpha tensor.Tensor // is a Row
	Eta   tensor.Tensor // is a Col
	//Kappa gorgonia.Scalar // Decay
	//Tau0 gorgonia.Scalar // offset

	// parameters needed for working
	Topics      int
	ChunkSize   int
	Terms       int
	UpdateEvery int
	EvalEvery   int

	// consts
	Iterations     int
	GammaThreshold float64
	MinimumProb    float64

	// track current progress
	Updates int

	// type
	Dtype tensor.Dtype
}

// init allocates Alpha and Eta as tensors of shape (Topics) and, for the
// Float64 and Float32 dtypes, fills both with 1/Topics. For any other dtype
// the tensors are left as allocated.
func (l *LDAModel) init() {
	eta := tensor.New(tensor.Of(l.Dtype), tensor.WithShape(l.Topics))
	alpha := tensor.New(tensor.Of(l.Dtype), tensor.WithShape(l.Topics))

	switch l.Dtype {
	case tensor.Float64:
		v := 1.0 / float64(l.Topics)
		eta.Memset(v)
		alpha.Memset(v)
	case tensor.Float32:
		v := float32(1) / float32(l.Topics)
		eta.Memset(v)
		alpha.Memset(v)
	}

	l.Alpha = alpha
	l.Eta = eta
}

================================================
FILE: corpus/test_test.go
================================================
package corpus

import (
	"strings"

	"github.com/chewxy/lingo/treebank"
)

// sample1Gram is the 1-gram fixture used by TestLoadOneGram.
const sample1Gram = `the 23135851162 of 13151942776 and 12997637966 to 12136980858 a 9081174698 in 8469404971 for 5933321709`

func mediumSentence()
[]treebank.SentenceTag {
	// a single CoNLL-U annotated sentence, parsed by treebank.ReadConllu
	conllu := `1 President President PROPN NNP Number=Sing 2 compound _ _ 2 Bush Bush PROPN NNP Number=Sing 5 nsubj _ _ 3 on on ADP IN _ 4 case _ _ 4 Tuesday Tuesday PROPN NNP Number=Sing 5 nmod _ _ 5 nominated nominate VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root _ _ 6 two two NUM CD NumType=Card 7 nummod _ _ 7 individuals individual NOUN NNS Number=Plur 5 dobj _ _ 8 to to PART TO _ 9 mark _ _ 9 replace replace VERB VB VerbForm=Inf 5 advcl _ _ 10 retiring retire VERB VBG VerbForm=Ger 11 amod _ _ 11 jurists jurist NOUN NNS Number=Plur 9 dobj _ _ 12 on on ADP IN _ 14 case _ _ 13 federal federal ADJ JJ Degree=Pos 14 amod _ _ 14 courts court NOUN NNS Number=Plur 11 nmod _ _ 15 in in ADP IN _ 18 case _ _ 16 the the DET DT Definite=Def|PronType=Art 18 det _ _ 17 Washington Washington PROPN NNP Number=Sing 18 compound _ _ 18 area area NOUN NN Number=Sing 14 nmod _ _ 19 . . PUNCT . _ 5 punct _ _ `
	readr := strings.NewReader(conllu)
	return treebank.ReadConllu(readr)
}

// EPSILON64 is the tolerance used by floatEquals64.
const EPSILON64 float64 = 1e-10

// floatEquals64 reports whether a and b differ by less than EPSILON64.
func floatEquals64(a, b float64) bool {
	if (a-b) < EPSILON64 && (b-a) < EPSILON64 {
		return true
	}
	return false
}

================================================
FILE: corpus/utils.go
================================================
package corpus

import (
	"errors"
	"math"
)

// minInt returns the smaller of a and b.
func minInt(a, b int) int {
	if a < b {
		return a
	}
	return b
}

// maxInt returns the larger of a and b.
func maxInt(a, b int) int {
	if a > b {
		return a
	}
	return b
}

// dot returns the dot product of a and b. It returns an error when the
// slices differ in length.
func dot(a, b []float64) (float64, error) {
	if len(a) != len(b) {
		return 0, errors.New("Differing lengths!")
	}

	var retVal float64
	for i, v := range a {
		retVal += v * b[i]
	}
	return retVal, nil
}

// mag returns the Euclidean length of a, i.e. the square root of dot(a, a).
func mag(a []float64) (float64, error) {
	dotProd, err := dot(a, a)
	if err != nil {
		return dotProd, err
	}
	return math.Sqrt(dotProd), nil
}

================================================
FILE: dep/README.md
================================================
# Dependency Parser #

Package `dependencyparser` is a package that provides data structures and algorithms for a
dependency parser as described by [Chen and Manning 2014](http://cs.stanford.edu/people/danqi/papers/emnlp2014.pdf) [PDF]. It achieves accuracy scores similar to those of the cited paper.

# Installing #

`go get -u github.com/chewxy/lingo/dep`

# How It Works #

## Transition Based Parsing ##

The core of the parser is a transition based parser, as popularized by [Nivre 2003](https://stp.lingfil.uu.se/~nivre/docs/iwpt03.pdf) [PDF]. It's essentially a [shift-reduce parser](https://en.wikipedia.org/wiki/Shift-reduce_parser) with more states. Dan Jurafsky has a very [complete overview of transition-based parsing](https://web.stanford.edu/~jurafsky/slp3/14.pdf) [PDF], which should be consulted should more questions arise.

### Transitions ###

At the core of a transition based parser are two data structures: a stack and a queue. The queue, or buffer, holds a list of words waiting to be parsed. Parsing is then simply a matter of manipulating the state of the stack and queue. Specifically there are three possible actions in an arc-standard parser:

* `Shift`: Shift simply shifts one word from the buffer on to the top of the stack
* `Left`: Left means the top of the stack is the head of the word underneath it. After the transition is applied (the link between the nodes attached), the word underneath the top of the stack is removed.
* `Right`: Right means that the top of the stack is the child of the word underneath it. After the transition is applied, the top of the stack is popped.

A word on the terms "head" and "child". Consider the sentence "I am human":

!["I am human" example](https://github.com/chewxy/lingo/blob/master/dep/documentation/iamhuman.dot.png?raw=true)

We say "human" is the head of the words "I" and "am". Therefore, "I" and "am" are considered to be children of "human".

### Example ###

Let's look at a simple example to make the ideas concrete: "The cat sat on the mat".
Here are the states:

| Step | Stack | Buffer | Transition |
|------|-------------------------------|-------------------------------------------|------------|
|0 | [ROOT] | ["The", "cat", "sat", "on", "the", "mat"] | Shift |
|1 | [ROOT, "The"] | ["cat", "sat", "on", "the", "mat"] | Shift |
|2 | [ROOT, "The", "cat"] | ["sat", "on", "the", "mat"] | Left |
|3 | [ROOT, "cat"] | ["sat", "on", "the", "mat"] | Shift |
|4 | [ROOT, "cat", "sat"] | ["on", "the", "mat"] | Left |
|5 | [ROOT, "sat"] | ["on", "the", "mat"] | Shift |
|6 | [ROOT, "sat", "on"] | ["the", "mat"] | Shift |
|7 | [ROOT, "sat", "on", "the"] | ["mat"] | Shift |
|8 | [ROOT, "sat", "on", "the", "mat"] | [] | Left |
|9 | [ROOT, "sat", "on", "mat"] | [] | Left |
|10| [ROOT, "sat", "mat"] | [] | Right |
|11| [ROOT, "sat"] | [] | Right |

The above transitions produce this parse tree:

!["the cat sat on the mat"](https://github.com/chewxy/lingo/blob/master/dep/documentation/thecatsatonthemat.dot.png?raw=true)

The real question then is of course - how does the system know which is the correct transition to emit, given the state? The answer is machine learning.

## Machine Learning ##

What exactly are we learning? Or more carefully put, what are the inputs and outputs of the machine learning algorithm? The table in the example above provides a template for the inputs and output.

The output is easy - the transition is what we want to learn. As for the input, it's a little bit more complex. The input consists of the stack and the buffer. It'd be impractical and slow to include everything in the stack and buffer (dynamic neural networks are somewhat slower than static ones).
So Chen and Manning came up with an ingenious idea:

* Use the top 3 words of the stack
* Use the top 3 words of the buffer
* Use the first and second leftmost/rightmost children of the first two words of the stack

Instead of directly using the words, POS tags and dependency relations as features, the rather ingenious idea was to use vectors drawn from an embedding matrix to represent these features. So instead of building sparse features, concatenating the vectors forms a fixed sized input vector. This makes training the network much more expedient. You'll find this in [features.go](https://github.com/chewxy/lingo/blob/master/dep/features.go)

Given each state above, it'd be fairly trivial to extract an input vector based on the 18 "features" listed and feed it forwards to a neural network. The result is a fast parser.

### Neural Network ###

The machine learning algorithm behind this parser is a simple 3-layered network. An input layer is constructed from the embedding matrices, and is forwarded to the first layer, which is activated by a cube activation function. This then passes forwards to a dropout layer before the last layer, which is a softmax layer.

[image of NN]

## Hairy Bits ##

The hairy bit of this is the oracle. Specifically, the question: given a training sentence, how do we generate correct examples such as the table above?

TODO: finish writing this section

# How To Use #

This package provides three main data structures for use:

* `Parser`
* `Model`
* `Trainer`

`Trainer` takes a `[]treebank.SentenceTag` and produces a `Model`. `Parser` requires a `Model` to run, and is basically an exported wrapper over `configuration` that handles a pipeline.

## Basic NLP Pipeline ##

```go
func main() {
	inputString := `The cat sat on the mat`
	lx := lexer.New("dummy", strings.NewReader(inputString)) // lexer - required to break a sentence up into words.
	pt := pos.New(pos.WithModel(posModel)) // POS Tagger - required to tag the words with a part of speech tag.
	dp := dep.New(depModel) // Creates a new parser

	// set up a pipeline
	pt.Input = lx.Output
	dp.Input = pt.Output

	// run all
	go lx.Run()
	go pt.Run()
	go dp.Run()

	// wait to receive:
	for {
		select {
		case d := <-dp.Output:
			// do something
		case err := <-dp.Error:
			// handle error
		}
	}
}
```

## Training A Model ##

To train a model you'd use the `Trainer`. The trainer accepts a `[]treebank.SentenceTag`. As long as you can parse your training file into those (package `treebank` accepts CONLLU formatted files as well as the PennTreebank formatted files), you'd be fine. An example trainer is in the cmd directory of `lingo`.

# FAQ #

**Why not an LSTM or RNN to encode the state of the stack and buffer?**

The answer is simplicity and speed. I have attempted variants of the parser with different neural networks - they don't work as fast as this. I am aware of Parsey-McParseface and the slightly improved accuracy compared to this model, but the speed has not been as great as I expected. This package emphasises parsing speed over accuracy - for most well written English sentences, this package performs well.

**Why are there no models?**

I'm afraid you're gonna have to train your own models. Training takes days on the Universal Dependency dataset and I haven't had the time to train on those. All my models are specific to the use of the company, and hence cannot be released.

**What caveats are there?**

Chen and Manning described using pre-computed activations for the top 10000 or so words. I did not implement that, but it would be trivial to revisit and implement it. Feel free to send a pull request.

**How can this be sped up?**

Use multiple, smaller trainers, each training on a separate batch. You can hence train them concurrently (pass the costs in a channel and collect at the end). At the end, sum the gradients before applying adagrad.
The trade off is that a LOT more memory will be used. It's also the reason why it wasn't included as the default. It's quite trivial to write though. Send a pull request if you have managed to reduce memory usage.

# Contributing #

see package lingo's CONTRIBUTING.md for more information. There is currently a list of issues in Github issues. Those are good places to start.

# Licence #

This package is MIT licenced.

================================================
FILE: dep/arcStandard.go
================================================
package dep

import "github.com/chewxy/lingo"

// var SingleRoot bool = true // make this part of a build process

// canApply checks if a particular transition can be applied
//
// For Left and Right the word that would become the head must exist on the
// stack, and the ROOT word (index 0) may only be a head through a lingo.Root
// dependency. Left needs more than two words on the stack. Right needs more
// than two, or exactly two once the buffer is empty. Any other move (Shift)
// needs a non-empty buffer.
func (c *configuration) canApply(t transition) bool {
	var h head
	if t.Move == Left || t.Move == Right {
		// h is the would-be head: the top of the stack for Left, the word
		// underneath it for Right
		if t.Move == Left {
			h = c.stackValue(0)
		} else {
			h = c.stackValue(1)
		}

		if h < 0 {
			return false
		}

		if h == 0 && t.DependencyType != lingo.Root {
			return false
		}
	}

	stackSize := c.stackSize()
	bufferSize := c.bufferSize()

	if t.Move == Left {
		return stackSize > 2
	}

	if t.Move == Right {
		return stackSize > 2 || (stackSize == 2 && bufferSize == 0)

		// if not single root build
		// return stackSize >= 2
	}

	return bufferSize > 0 // strange other thing...
}

// apply applies the transition
//
// Left attaches the second-from-top word to the top word and removes the
// former from the stack. Right attaches the top word to the one underneath
// and removes the top. Anything else shifts the next buffer word onto the
// stack. apply does not check canApply itself.
func (c *configuration) apply(t transition) {
	logf("Applying %v", t)
	w1 := int(c.stackValue(1)) // second from top
	w2 := int(c.stackValue(0)) // top of the stack

	if t.Move == Left {
		c.AddArc(w2, w1, t.DependencyType)
		c.removeSecondTopStack()
	} else if t.Move == Right {
		c.AddArc(w1, w2, t.DependencyType)
		c.removeTopStack()
	} else {
		c.shift()
	}
}

// oracle gets the gold transition given the state
//
// It returns Left when the gold head of the second-from-top word is the top
// word. It returns Right when the gold head of the top word is the word
// underneath it and every gold child of the top word has already been
// attached. Otherwise it returns the zero transition (Shift).
func (c *configuration) oracle(goldParse *lingo.Dependency) (t transition) {
	w1 := int(c.stackValue(1)) // second from top
	w2 := int(c.stackValue(0)) // top of the stack

	if w1 > 0 && goldParse.Head(w1) == w2 {
		t.Move = Left
		t.DependencyType = goldParse.Label(w1)
		return
	} else if w1 >= 0 && goldParse.Head(w2) == w1 && !c.hasOtherChildren(w2, goldParse) {
		t.Move = Right
		t.DependencyType = goldParse.Label(w2)
		return
	}
	return // default transition is Shift
}

================================================
FILE: dep/arcStandard_test.go
================================================
package dep

import (
	"testing"

	"github.com/chewxy/lingo"
	"github.com/stretchr/testify/assert"
)

// TestCanApply checks which transitions are allowed with only ROOT on the
// stack, and again after two shifts.
func TestCanApply(t *testing.T) {
	dep := simpleSentence()[0].Dependency(dummyFix{})

	buffer := make([]head, 0)
	for i := 1; i < dep.WordCount(); i++ {
		buffer = append(buffer, head(i))
	}

	stack := []head{0}

	c := &configuration{
		Dependency: dep,
		stack:      stack,
		buffer:     buffer,
	}

	assert := assert.New(t)

	logf("Start config: \n%v", c)

	rootLeft := c.canApply(transition{Left, lingo.Root})
	rootRight := c.canApply(transition{Right, lingo.Root})
	NSubjLeft := c.canApply(transition{Left, lingo.NSubj})
	NSubjRight := c.canApply(transition{Right, lingo.NSubj})
	ShiftDep := c.canApply(transition{Shift, lingo.NoDepType})

	// only ROOT is on the stack, so only Shift is possible
	assert.Equal(false, rootLeft, "rootLeft should be false")
	assert.Equal(false, rootRight, "rootRight should be false")
	assert.Equal(false, NSubjLeft, "NSubjLeft should be false")
	assert.Equal(false, NSubjRight, "NSubjRight should be false")
	assert.Equal(true, ShiftDep, "ShiftDep should be true")

	// NOTE(review): the arguments are in the opposite order to the labels in the format string.
	logf("rootRight: %v, rootLeft: %v", rootLeft, rootRight)
	logf("NSubjRight: %v, NSubjLeft: %v", NSubjRight, NSubjLeft)
	logf("ShiftDep: %v", ShiftDep)

	// after two shifts there are three words on the stack
	c.shift()
	c.shift()

	logf("%v", c)

	rootLeft = c.canApply(transition{Left, lingo.Root})
	rootRight = c.canApply(transition{Right, lingo.Root})
	NSubjLeft = c.canApply(transition{Left, lingo.NSubj})
	NSubjRight = c.canApply(transition{Right, lingo.NSubj})
	ShiftDep = c.canApply(transition{Shift, lingo.NoDepType})

	assert.Equal(true, rootLeft, "rootLeft should be true")
	assert.Equal(true, rootRight, "rootRight should be true")
	assert.Equal(true, NSubjLeft, "NSubjLeft should be true")
	assert.Equal(true, NSubjRight, "NSubjRight should be true")
	assert.Equal(true, ShiftDep, "ShiftDep should be true")

	// NOTE(review): the arguments are in the opposite order to the labels in the format string.
	logf("rootRight: %v, rootLeft: %v", rootLeft, rootRight)
	logf("NSubjRight: %v, NSubjLeft: %v", NSubjRight, NSubjLeft)
	logf("ShiftDep: %v", ShiftDep)
}

// TestOracle replays the oracle's transitions on a gold sentence and expects
// the resulting heads to equal the gold heads. The loop is capped at 100
// steps.
func TestOracle(t *testing.T) {
	st := simpleSentence()[0]
	s := st.AnnotatedSentence(nil)
	c := newConfiguration(s, true)
	d := s.Dependency()

	for count := 0; !c.isTerminal() && count < 100; count++ {
		oracle := c.oracle(d)

		// a {Right, Root} transition is applied even when canApply rejects it
		if !c.canApply(oracle) && (oracle != transition{Right, lingo.Root}) {
			t.Errorf("Cannot apply %v", oracle)
			break
		}
		c.apply(oracle)
	}

	assert.Equal(t, d.Heads(), c.Heads())
}

================================================
FILE: dep/configuration.go
================================================
package dep

import (
	"fmt"

	"github.com/chewxy/lingo"
)

// describes the current state of the parser

// head is the index of a word in the sentence; index 0 is the ROOT word.
type head int

// DOES_NOT_EXIST (-1) is returned by lookups that have no word to return.
const (
	DOES_NOT_EXIST head = iota - 1
)

// configuration is the meat of the shift-reduce parsing.
It holds the state for the shift reduction type configuration struct { *lingo.Dependency stack []head buffer []head bp int // buffer pointer - starts at 0, increments } func newConfiguration(sentence lingo.AnnotatedSentence, fromGold bool) *configuration { if fromGold { sentence = sentence.Clone() } dep := lingo.NewDependency(lingo.FromAnnotatedSentence(sentence), lingo.AllocTree()) dep.SetID() sentence = sentence[1:] // because the POSTagger automatically adds a ROOTTAG at the end of it var buffer []head for i := 1; i <= len(sentence); i++ { buffer = append(buffer, head(i)) } var stack []head stack = append(stack, head(0)) // add root return &configuration{ Dependency: dep, stack: stack, buffer: buffer, } } func (c *configuration) String() string { return fmt.Sprintf("Stack: %v Buffer(%d): %v", c.stack, c.bp, c.buffer[c.bp:]) } func (c *configuration) GoString() string { return fmt.Sprintf("Stack: %v Buffer(%d): %v\nHeads: %v\nRels: %v\n", c.stack, c.bp, c.buffer[c.bp:], c.Heads(), c.Labels()) } func (c *configuration) bufferSize() int { return len(c.buffer) - c.bp } func (c *configuration) stackSize() int { return len(c.stack) } func (c *configuration) head(i int) head { heads := c.Heads() // TODO: maybe some sanity checks? return head(heads[i]) } // gets the sentence index of the ith word on the stack. If there isn't anything on the stack, it returns DOES_NOT_EXIST func (c *configuration) stackValue(i int) head { size := c.stackSize() if i >= size || i < 0 { return DOES_NOT_EXIST } return c.stack[size-1-i] } func (c *configuration) bufferValue(i int) head { size := c.bufferSize() if i >= size { return DOES_NOT_EXIST } return c.buffer[i+c.bp] } /* stack machinations */ // pop pops the stack. It isn't really used any more. 
// removeStack(), removeTopStack() and removeSecondTopStack() has superseded its function
//
// pop does not check for an empty stack.
func (c *configuration) pop() head {
	retVal := c.stack[len(c.stack)-1]
	c.stack = c.stack[0 : len(c.stack)-1]
	return retVal
}

// removes a value from the stack.
//
// i is an index into c.stack counted from the bottom. The elements above it
// are moved down one place and the stack is shortened by one.
func (c *configuration) removeStack(i int) {
	c.stack = c.stack[:i+copy(c.stack[i:], c.stack[i+1:])]
}

// removeSecondTopStack removes the 2nd-to-last element
// It reports false, and does nothing, when the stack has fewer than two elements.
func (c *configuration) removeSecondTopStack() bool {
	stackSize := c.stackSize()
	if stackSize < 2 {
		return false
	}

	i := stackSize - 2
	c.removeStack(i)
	return true
}

// removeTopStack removes the last element. It reports false, and does
// nothing, when the stack is empty.
func (c *configuration) removeTopStack() bool {
	stackSize := c.stackSize()
	if stackSize < 1 {
		return false
	}

	i := stackSize - 1
	c.removeStack(i)
	return true
}

/* Dependency related stuff */

// label returns the dependency label of word i. It returns lingo.NoDepType
// for a missing word (i < 0) and for ROOT (i == 0).
func (c *configuration) label(i head) lingo.DependencyType {
	if i < 0 {
		return lingo.NoDepType
	}

	if i == 0 {
		return lingo.NoDepType
	}

	return c.Label(int(i))
	// i--
	// labels := c.Labels()
	// return labels[i]
}

// annotation returns the annotation of word i: the null annotation for a
// missing word (i < 0) and the root annotation for ROOT (i == 0).
func (c *configuration) annotation(i head) *lingo.Annotation {
	if i < 0 {
		return lingo.NullAnnotation()
	}
	if i == 0 {
		return lingo.RootAnnotation()
	}

	// i--
	return c.Annotation(int(i))
	// return c.Sentence()[i]
}

// gets the cnt-th left child of the kth word of a sentence
//
// Candidates are the words before k whose head is k, scanned from the start
// of the sentence, so cnt == 1 is the leftmost child. It returns
// DOES_NOT_EXIST when k is out of range or there are fewer than cnt such
// children.
func (c *configuration) lc(k, cnt head) head {
	if k < 0 || int(k) > c.N() {
		return DOES_NOT_EXIST
	}

	cc := 0
	for i := 1; i < int(k); i++ {
		if c.Head(i) == int(k) {
			cc++
			if int(cnt) == cc {
				return head(i)
			}
		}
	}
	return DOES_NOT_EXIST
}

// rc gets the cnt-th right child of the kth word. Candidates are the words
// after k whose head is k, scanned from the end of the sentence, so cnt == 1
// is the rightmost child. It returns DOES_NOT_EXIST when k is out of range or
// there are fewer than cnt such children.
func (c *configuration) rc(k, cnt head) head {
	if k < 0 || int(k) > c.N() {
		return DOES_NOT_EXIST
	}

	cc := 0
	for i := c.N(); i > int(k); i-- {
		if c.Head(i) == int(k) {
			cc++
			if cc == int(cnt) {
				return head(i)
			}
		}
	}
	return DOES_NOT_EXIST
}

// hasOtherChildren reports whether the gold parse gives word i a child that
// is not yet attached to i in this configuration.
func (c *configuration) hasOtherChildren(i int, goldParse *lingo.Dependency) bool {
	for j := 1; j <= goldParse.N(); j++ {
		if goldParse.Head(j) == i && c.Head(j) != i {
			return true
		}
	}
	return false
}

// isTerminal reports whether parsing is finished: a single word is left on
// the stack and the buffer is empty.
func (c *configuration) isTerminal() bool {
	return c.stackSize() == 1 && c.bufferSize() == 0
}

// Actual Transitioning stuff

// shift moves the next buffer word onto the stack. It reports false, and does
// nothing, when the buffer is empty.
func (c *configuration) shift() bool {
	i := c.bufferValue(0)
	if i == DOES_NOT_EXIST {
		return false
	}

	c.bp++                       // move the buffer pointer up
	c.stack = append(c.stack, i) // push to it.... gotta work the pop
	return true
}

================================================
FILE: dep/configuration_test.go
================================================
package dep

import (
	"testing"

	"github.com/chewxy/lingo"
	"github.com/stretchr/testify/assert"
)

// TestStackAppendRemove exercises removeTopStack and removeSecondTopStack,
// then checks the heads and labels of the gold dependency.
func TestStackAppendRemove(t *testing.T) {
	sentence := mediumSentence()[0]
	as := sentence.AnnotatedSentence(dummyFix{})
	c := newConfiguration(as, true)

	t.Logf("C: %v", c)
	t.Logf("C: %#v", c)

	assert := assert.New(t)

	c.stack = append(c.stack, 200)
	assert.Equal([]head{0, 200}, c.stack, "stack is not equal after appending")

	c.removeTopStack()
	assert.Equal([]head{0}, c.stack, "stack is not equal after removeTopStack")

	c.stack = append(c.stack, 200)
	c.removeSecondTopStack()
	assert.Equal([]head{200}, c.stack, "stack is not equal after removeSecondTopStack()")

	correctHeads := []int{-1} // the -1 is the root
	correctHeads = append(correctHeads, sentence.Heads...)

	correctLabels := []lingo.DependencyType{lingo.Root}
	correctLabels = append(correctLabels, sentence.Labels...)
dep := sentence.Dependency(dummyFix{}) assert.Equal(correctHeads, dep.Heads(), "Heads are not equal") assert.Equal(correctLabels, dep.Labels(), "Labels are not equal %v \n %v", correctLabels, dep.Labels()) } func TestConfiguration_StackValue(t *testing.T) { c := new(configuration) c.stack = []head{0, 1, 2, 5, 6} zero := c.stackValue(0) one := c.stackValue(1) four := c.stackValue(4) five := c.stackValue(5) negone := c.stackValue(-1) assert := assert.New(t) assert.Equal(head(6), zero, "Zeroth value not the same") assert.Equal(head(5), one, "First value not the same") assert.Equal(head(0), four, "Fourth value not the same") assert.Equal(DOES_NOT_EXIST, five, "Fifth value not the same") assert.Equal(DOES_NOT_EXIST, negone, "NegOne value not the same") } ================================================ FILE: dep/debug.go ================================================ // +build debug package dep import ( "bytes" "fmt" "log" "runtime" "strings" "sync/atomic" "github.com/chewxy/lingo" ) const BUILD_DEBUG = "PARSER: DEBUG BUILD" const BUILD_DIAG = "Diagnostic Build" const DEBUG = true var READMEMSTATS = true var TABCOUNT uint32 = 0 func tabcount() int { return int(atomic.LoadUint32(&TABCOUNT)) } func enterLoggingContext() { atomic.AddUint32(&TABCOUNT, 1) tc := tabcount() log.SetPrefix(strings.Repeat("\t", tc)) } func leaveLoggingContext() { tc := tabcount() tc-- if tc < 0 { atomic.StoreUint32(&TABCOUNT, 0) tc = 0 } else { atomic.StoreUint32(&TABCOUNT, uint32(tc)) } log.SetPrefix(strings.Repeat("\t", tc)) } func logf(format string, others ...interface{}) { if !DEBUG { return } log.Printf(format, others...) } func logTrainingProgress(iteration, correct, total, length, possibles int) { if !DEBUG { return } log.Printf("Iteration %d. 
Correct/Total: %d/%d = %.2f", iteration, correct, total, float64(correct)/float64(total)) log.Printf("DictSize: %d/%d, load factor of: %.2f", length, possibles, float64(length)/float64(possibles)) } func logMemStats() { if !DEBUG || !READMEMSTATS { return } var mem runtime.MemStats runtime.ReadMemStats(&mem) log.Printf("Allocated : %.2f MB", (float64(mem.Alloc)/1024)/float64(1024)) log.Printf("Total Allocated : %.2f MB", (float64(mem.TotalAlloc)/1024)/float64(1024)) log.Printf("Heap Allocted : %.2f MB", (float64(mem.HeapAlloc)/1024)/float64(1024)) log.Printf("Sys Total Allocated: %.2f MB", (float64(mem.HeapSys)/1024)/float64(1024)) log.Println("----------") } func recoverFrom(format string, attrs ...interface{}) { if r := recover(); r != nil { log.Printf(format, attrs...) panic(r) } } /* Nice output of shit */ func (d *Parser) SprintFeatures(features []int) string { // tabcount := int(atomic.LoadUint32(&TABCOUNT)) var buf bytes.Buffer for i := 0; i < 18; i++ { number := features[i] id := number - wordFeatsStartAt word, _ := d.corpus.Word(id) if word == "" { word = "-NULL-" } buf.WriteString(fmt.Sprintf("%d, %q, %d \n", feature(i), word, number)) } for i := 0; i < 18; i++ { number := features[i+18] buf.WriteString(fmt.Sprintf("%d, %v, %d\n", feature(i+18), lingo.POSTag(number), number)) } for i := 0; i < 12; i++ { number := features[i+36] id := number - labelFeatsStartAt buf.WriteString(fmt.Sprintf("%d, %v, %d\n", feature(i+36), lingo.DependencyType(id), number)) } return buf.String() } func SprintScores(scores []float64, ts []transition) string { var buf bytes.Buffer for i, v := range scores { if i >= len(ts) { buf.WriteString(fmt.Sprintf("UNKNOWN TRANSITION, %v\n", v)) continue } buf.WriteString(fmt.Sprintf("%v, %v\n", ts[i], v)) } return buf.String() } func SprintFloatSlice(a []float64) string { var buf bytes.Buffer buf.WriteString("[") for i, v := range a { if i < len(a)-1 { buf.WriteString(fmt.Sprintf("%v, ", v)) } else { buf.WriteString(fmt.Sprintf("%v", v)) } 
} buf.WriteString("]") return buf.String() } ================================================ FILE: dep/dependencyParser.go ================================================ package dep import ( "fmt" "github.com/chewxy/lingo" "github.com/chewxy/lingo/corpus" "github.com/pkg/errors" ) var KnownWords *corpus.Corpus // package provided global // Parser is the object that performs the dependency parsing // It contains a neural network, which is the core of it. // // The same object can be used to train the NN type Parser struct { Input chan lingo.AnnotatedSentence Output chan *lingo.Dependency Error chan error *Model } // New creates a new Parser func New(m *Model) *Parser { d := &Parser{ Output: make(chan *lingo.Dependency), Error: make(chan error), Model: m, } return d } // Run is used when using the NN to parse a sentence. For training, see Train() func (d *Parser) Run() { defer close(d.Output) for sentence := range d.Input { dep, err := d.predict(sentence) if err != nil { d.Error <- err return } d.Output <- dep } return } func (d *Parser) predict(sentence lingo.AnnotatedSentence) (*lingo.Dependency, error) { // defer func() { // if r := recover(); r != nil { // log.Printf("Parsing for %q", sentence.ValueString()) // panic(r) // } // }() c := newConfiguration(sentence, false) var err error var argmax int var count int for !c.isTerminal() && count < 100 { logf("%v", c) if count == 99 { logf("TARPIT") } features := getFeatures(c, d.corpus) // features2 := getFeatureArray(c, d.dict) if argmax, err = d.nn.pred(features); err != nil { return nil, err } // log.Printf("Argmax: %v, len(d.ts): %v, len(transitions) %v", argmax, len(d.ts), len(transitions)) t := transitions[argmax] // no this is NOT a mistake if !c.canApply(t) { t = transition{Shift, lingo.NoDepType} // reset // manual argmaxing switch scores := d.nn.scores.Value().Data().(type) { case []float32: var maxScore float32 for i, kt := range d.ts { if scores[i] > maxScore && c.canApply(kt) { maxScore = scores[i] t = 
kt } } case []float64: var maxScore float64 for i, kt := range d.ts { if scores[i] > maxScore && c.canApply(kt) { maxScore = scores[i] t = kt } } default: return nil, errors.Errorf("Unhandled score type %T", d.nn.scores.Value()) } } c.apply(t) count++ } fix(c.Dependency) return c.Dependency, err } func (d *Parser) String() string { var nns, ds string if d.corpus != nil { ds = fmt.Sprintf("\nDict Size: %d words\nMAXTAG: %d\nMAXDEPTYPE: %d\n", d.corpus.Size(), lingo.MAXTAG, lingo.MAXDEPTYPE) } else { ds = "\n" } if d.nn != nil && d.nn.initialized() { nns = fmt.Sprintf("\nNeural Network:\n=================\n%v\n", d.nn) } if !d.nn.initialized() { panic(fmt.Sprintf("%v", d.nn)) } base := "\n\nDependency Parser Info:\n=======================\n" return base + ds + nns } ================================================ FILE: dep/documentation/iamhuman.dot ================================================ digraph G { Node_0xc425b88740->Node_0xc425b88780[ label=Root ]; Node_0xc425b88780->Node_0xc425b88800[ label=Cop ]; Node_0xc425b88780->Node_0xc425b887c0[ label=NSubj ]; Node_0xc425b88740 [ label="0: "-ROOT-/ROOT_TAG"" ]; Node_0xc425b88780 [ label="3: "human/JJ"" ]; Node_0xc425b887c0 [ label="1: "I/PRP"" ]; Node_0xc425b88800 [ label="2: "am/VBP"" ]; } ================================================ FILE: dep/documentation/thecatsatonthemat.dot ================================================ digraph G { Node_0xc4349eeec0->Node_0xc4349eef80[ label=Root ]; Node_0xc4349eef80->Node_0xc4349eefc0[ label=NMod ]; Node_0xc4349eefc0->Node_0xc4349ef040[ label=Det ]; Node_0xc4349eef80->Node_0xc4349eef00[ label=NSubj ]; Node_0xc4349eef00->Node_0xc4349eef40[ label=Det ]; Node_0xc4349eefc0->Node_0xc4349ef000[ label=Case ]; Node_0xc4349eeec0 [ label="0: "-ROOT-/ROOT_TAG"" ]; Node_0xc4349eef00 [ label="2: "cat/NN"" ]; Node_0xc4349eef40 [ label="1: "the/DT"" ]; Node_0xc4349eef80 [ label="3: "sat/VBD"" ]; Node_0xc4349eefc0 [ label="6: "mat/NN"" ]; Node_0xc4349ef000 [ label="4: "on/IN"" ]; 
Node_0xc4349ef040 [ label="5: "the/DT"" ]; } ================================================ FILE: dep/errors.go ================================================ package dep import ( "fmt" "github.com/chewxy/lingo" ) type componentUnavailable string func (c componentUnavailable) Error() string { return fmt.Sprintf("%v unavailable", c) } func (c componentUnavailable) Component() string { return string(c) } // TarpitError is an error when the arc-standard is stuck. // It implements GoStringer, which when called will output the state as a string. // It also implements lingo.Sentencer, so the offending sentence can easily be retrieved type TarpitError struct{ *configuration } func (err TarpitError) Error() string { return "Tarpit Error" } // NonProjective error is the error that is emitted when the dependency tree is not projective (that is to say the children cross lines) type NonProjectiveError struct{ *lingo.Dependency } func (err NonProjectiveError) Error() string { return "Non-projective tree" } ================================================ FILE: dep/evaluation.go ================================================ package dep import ( "fmt" "io/ioutil" "github.com/chewxy/lingo" "github.com/chewxy/lingo/treebank" ) // Performance is a tuple that holds performance information from a training session type Performance struct { Iter int // which training iteration is this? UAS float64 // Unlabelled Attachment Score LAS float64 // Labeled Attachment Score UEM float64 // Unlabelled Exact Match Root float64 // Correct Roots Ratio } func (p Performance) String() string { s := `EPO: %d UAS: %.5f LAS: %.5f UEM: %.5f ROO: %.5f` return fmt.Sprintf(s, p.Iter, p.UAS, p.LAS, p.UEM, p.Root) } // performance evaluation related code goes here // Evaluate compares predicted trees with the gold standard trees and returns a Performance. 
It panics if the number of predicted trees and the number of gold trees aren't the same func Evaluate(predictedTrees, goldTrees []*lingo.Dependency) Performance { if len(predictedTrees) != len(goldTrees) { panic(fmt.Sprintf("%d predicted trees; %d gold trees. Unable to compare", len(predictedTrees), len(goldTrees))) } var correctLabels, correctHeads, correctTrees, correctRoot, sumArcs float64 var check int for i, tr := range predictedTrees { gTr := goldTrees[i] if len(tr.AnnotatedSentence) != len(gTr.AnnotatedSentence) { sumArcs += float64(gTr.N()) // log.Printf("WARNING: %q and %q do not have the same length", tr, gTr) continue } var nCorrectHead int for j, a := range tr.AnnotatedSentence[1:] { b := gTr.AnnotatedSentence[j+1] if a.HeadID() == b.HeadID() { correctHeads++ nCorrectHead++ } if a.DependencyType == b.DependencyType { correctLabels++ } sumArcs++ } if nCorrectHead == gTr.N() { correctTrees++ } if tr.Root() == gTr.Root() { correctRoot++ } // check 5 per iteration if check < 5 { logf("predictedHeads: \n%v\n%v\n", tr.Heads(), gTr.Heads()) logf("Ns: %v | %v || Correct: %v", tr.N(), gTr.N(), nCorrectHead) check++ } } uas := correctHeads / sumArcs las := correctLabels / sumArcs uem := correctTrees / float64(len(predictedTrees)) roo := correctRoot / float64(len(predictedTrees)) return Performance{UAS: uas, LAS: las, UEM: uem, Root: roo} } func (t *Trainer) crossValidate(st []treebank.SentenceTag) Performance { preds := t.predMany(st) golds := make([]*lingo.Dependency, len(st)) for i, s := range st { golds[i] = s.Dependency(t) } return Evaluate(preds, golds) } func (t *Trainer) predMany(sentenceTags []treebank.SentenceTag) []*lingo.Dependency { retVal := make([]*lingo.Dependency, len(sentenceTags)) for i, st := range sentenceTags { dep, err := t.pred(st.AnnotatedSentence(t)) if err != nil { ioutil.WriteFile("fullGraph.dot", []byte(t.nn.g.ToDot()), 0644) panic(fmt.Sprintf("%+v", err)) } retVal[i] = dep } return retVal } func (t *Trainer) pred(as 
lingo.AnnotatedSentence) (*lingo.Dependency, error) { d := new(Parser) d.Model = t.Model return d.predict(as) } ================================================ FILE: dep/example.go ================================================ package dep import ( "math/rand" "github.com/chewxy/lingo" "github.com/chewxy/lingo/corpus" "github.com/chewxy/lingo/treebank" ) // example is a training example. type example struct { transition features []int // features are used in the embeddings labels []int // labels are used in scoring the transitions } func makeExamples(sentenceTags []treebank.SentenceTag, conf NNConfig, dict *corpus.Corpus, ts []transition, f lingo.AnnotationFixer) []example { var examples []example var tarpit, nonprojective, good int for i, sentenceTag := range sentenceTags { exs, err := makeOneExample(i, sentenceTag, dict, ts, f) if err != nil { switch err.(type) { case TarpitError: tarpit++ case NonProjectiveError: nonprojective++ } } else { examples = append(examples, exs...) good++ } } logf("Number of SentenceTags Generated Into Examples: %d/%d | Number of Examples: %d | Number of nonprojective examples: %d | Number of tarpit examples: %d", good, len(sentenceTags), len(examples), nonprojective, tarpit) return examples } // makeOneExample is an example of a poorly named function. 
It makes an example from a SentenceTag func makeOneExample(i int, sentenceTag treebank.SentenceTag, dict *corpus.Corpus, ts []transition, f lingo.AnnotationFixer) ([]example, error) { var examples []example s := sentenceTag.AnnotatedSentence(f) dep := s.Dependency() if dep.IsProjective() { c := newConfiguration(s, true) count := 0 for !c.isTerminal() && count < 1000 { if count == 999 { return examples, TarpitError{c} } oracle := c.oracle(dep) features := getFeatures(c, dict) labels := make([]int, MAXTRANSITION) for i, t := range ts { if t == oracle { labels[i] = 1 } else if c.canApply(t) { labels[i] = 0 } else { labels[i] = -1 } } ex := example{transition{oracle.Move, oracle.DependencyType}, features, labels} examples = append(examples, ex) c.apply(oracle) count++ } } else { return nil, NonProjectiveError{dep} } return examples, nil } func shuffleExamples(a []example) { for i := range a { j := rand.Intn(i + 1) a[i], a[j] = a[j], a[i] } } ================================================ FILE: dep/example_test.go ================================================ package dep import ( "testing" "github.com/chewxy/lingo/corpus" ) func TestMakeExamples(t *testing.T) { st := simpleSentence() dict := corpus.GenerateCorpus(st) exs := makeExamples(st, DefaultNNConfig, dict, transitions, dummyFix{}) if len(exs) != 20 { t.Error("Expected 20 examples to be generated from simple sentence") } } ================================================ FILE: dep/featureExtraction.go ================================================ package dep import ( "github.com/chewxy/lingo" "github.com/chewxy/lingo/corpus" ) // getFeatures extracts the IDs to pass into the neural network. 
These IDs are used in the network to construct the input layers func getFeatures(c *configuration, dict *corpus.Corpus) []int { // logf("CONFIG: %v", c) wordFeats := make([]int, 0) posFeats := make([]lingo.POSTag, 0) labelFeats := make([]lingo.DependencyType, 0) unknownID, _ := dict.Id("-UNKNOWN-") for j := 2; j >= 0; j-- { index := c.stackValue(j) mor := c.annotation(index) if wordID, ok := dict.Id(mor.Value); ok { wordFeats = append(wordFeats, wordID) } else { wordFeats = append(wordFeats, unknownID) } posFeats = append(posFeats, mor.POSTag) } // logf("wordFeats: %v", wordFeats) for j := 0; j <= 2; j++ { index := c.bufferValue(j) mor := c.annotation(index) // logf("Want: %v Index: %d. Morpheme: %v", j, index, mor) if wordID, ok := dict.Id(mor.Value); ok { wordFeats = append(wordFeats, wordID) } else { wordFeats = append(wordFeats, unknownID) } posFeats = append(posFeats, mor.POSTag) } // logf("wordFeats: %v", wordFeats) for j := 0; j <= 1; j++ { k := c.stackValue(j) index := c.lc(k, 1) mor := c.annotation(index) if wordID, ok := dict.Id(mor.Value); ok { wordFeats = append(wordFeats, wordID) } else { wordFeats = append(wordFeats, unknownID) } posFeats = append(posFeats, mor.POSTag) labelFeats = append(labelFeats, c.label(index)) index = c.rc(k, 1) mor = c.annotation(index) if wordID, ok := dict.Id(mor.Value); ok { wordFeats = append(wordFeats, wordID) } else { wordFeats = append(wordFeats, unknownID) } posFeats = append(posFeats, mor.POSTag) labelFeats = append(labelFeats, c.label(index)) index = c.lc(k, 2) mor = c.annotation(index) if wordID, ok := dict.Id(mor.Value); ok { wordFeats = append(wordFeats, wordID) } else { wordFeats = append(wordFeats, unknownID) } posFeats = append(posFeats, mor.POSTag) labelFeats = append(labelFeats, c.label(index)) index = c.rc(k, 2) mor = c.annotation(index) if wordID, ok := dict.Id(mor.Value); ok { wordFeats = append(wordFeats, wordID) } else { wordFeats = append(wordFeats, unknownID) } posFeats = append(posFeats, mor.POSTag) 
labelFeats = append(labelFeats, c.label(index)) leftChild := c.lc(k, 1) index = c.lc(leftChild, 1) mor = c.annotation(index) if wordID, ok := dict.Id(mor.Value); ok { wordFeats = append(wordFeats, wordID) } else { wordFeats = append(wordFeats, unknownID) } posFeats = append(posFeats, mor.POSTag) labelFeats = append(labelFeats, c.label(index)) rightChild := c.rc(k, 1) index = c.rc(rightChild, 1) mor = c.annotation(index) if wordID, ok := dict.Id(mor.Value); ok { wordFeats = append(wordFeats, wordID) } else { wordFeats = append(wordFeats, unknownID) } posFeats = append(posFeats, mor.POSTag) labelFeats = append(labelFeats, c.label(index)) } // the embedding matrix is arranged thus: /* POSTag0 0, 1, ... 50 POSTag1 ... MAXTAG-1 DepType0 DepType1 ... MAXDEPTYPE-1 WordID0 ... WordIDN */ features := make([]int, MAXFEATURE) for i, w := range wordFeats { features[i] = w + wordFeatsStartAt } for i, t := range posFeats { features[i+POS_OFFSET] = int(t) } for i, l := range labelFeats { features[i+DEP_OFFSET] = int(l) + labelFeatsStartAt } return features } const ( POS_OFFSET int = 18 DEP_OFFSET = 36 STACK_OFFSET = 6 STACK_NUMBER = 6 ) ================================================ FILE: dep/features.go ================================================ package dep import "github.com/chewxy/lingo" // the features are used as columns in the matrix // go:generate stringer type=feature -output=feature_string.go type feature int const ( // first 18 are word related features // second 18 are POS related features // last 12 are label related features s0w feature = iota s1w s2w b0w b1w b2w s0l1w s0r1w s0l2w s0r2w s0llw s0rrw s1l1w s1r1w s1l2w s1r2w s1llw s1rrw // POS related words s0t s1t s2t b0t b1t b2t s0l1t s0r1t s0l2t s0r2t s0llt s0rrt s1l1t s1r1t s1l2t s1r2t s1llt s1rrt // label related s0l1d s0r1d s0l2d s0r2d s0lld s0rrd s1l1d s1r1d s1l2d s1r2d s1lld s1rrd MAXFEATURE ) const ( wordFeatsStartAt int = int(lingo.MAXTAG) + int(lingo.MAXDEPTYPE) labelFeatsStartAt = int(lingo.MAXTAG) 
posFeatsStartAt = 0 ) ================================================ FILE: dep/features_string.go ================================================ // generated by stringer -type=feature -output=features_string.go; DO NOT EDIT package dep import "fmt" const _feature_name = "s0ws1ws2wb0wb1wb2ws0l1ws0r1ws0l2ws0r2ws0llws0rrws1l1ws1r1ws1l2ws1r2ws1llws1rrws0ts1ts2tb0tb1tb2ts0l1ts0r1ts0l2ts0r2ts0llts0rrts1l1ts1r1ts1l2ts1r2ts1llts1rrts0l1ds0r1ds0l2ds0r2ds0llds0rrds1l1ds1r1ds1l2ds1r2ds1llds1rrdMAXFEATURE" var _feature_index = [...]uint8{0, 3, 6, 9, 12, 15, 18, 23, 28, 33, 38, 43, 48, 53, 58, 63, 68, 73, 78, 81, 84, 87, 90, 93, 96, 101, 106, 111, 116, 121, 126, 131, 136, 141, 146, 151, 156, 161, 166, 171, 176, 181, 186, 191, 196, 201, 206, 211, 216, 226} func (i feature) String() string { if i < 0 || i >= feature(len(_feature_index)-1) { return fmt.Sprintf("feature(%d)", i) } return _feature_name[_feature_index[i]:_feature_index[i+1]] } ================================================ FILE: dep/fix.go ================================================ package dep import ( "log" "github.com/chewxy/lingo" ) // applies common fixes func fix(d *lingo.Dependency) { // NNP fix: // If a sentence is [a, b, c, D, E, f, g] // where D, E are NNPs, they should be compound words // The head should be the one with higher headID spans := properNounSpans(d) for _, s := range spans { // we don't care about single word proper nouns if s.end-s.start <= 1 { continue } phrase := d.AnnotatedSentence[s.start:s.end] // pick up all compound roots // find annotations that do not have compound as deptype var compoundRoots lingo.AnnotationSet var problematic lingo.AnnotationSet for _, a := range phrase { if lingo.IsCompound(a.DependencyType) { compoundRoots = compoundRoots.Add(a.Head) } if !lingo.IsCompound(a.DependencyType) && a.ID != s.end-1 { problematic = problematic.Add(a) } } // if no root if len(compoundRoots) == 0 { // actual root is the word with the largest ID var compoundRoot 
*lingo.Annotation var rootRoot *lingo.Annotation for last := -1; s.end+last >= s.start; last-- { predictedRoot := s.end + last compoundRoot = d.AnnotatedSentence[predictedRoot] // incorrects : // dep==Dep // dep==Root && others has dep != root if compoundRoot.DependencyType == lingo.Dep { problematic = problematic.Add(compoundRoot) continue } if compoundRoot.DependencyType != lingo.Dep && compoundRoot.DependencyType != lingo.Root { break } if compoundRoot.DependencyType == lingo.Root { rootRoot = compoundRoot problematic = problematic.Add(compoundRoot) } } if rootRoot != nil && rootRoot != compoundRoot { // we have two potential roots. Choose the best log.Println("Problem when fixing: more than one possible compound root found") } for _, a := range problematic { if a == compoundRoot { continue } tmpHead := a.Head tmpRel := a.DependencyType a.SetHead(compoundRoot) a.DependencyType = lingo.Compound for _, childID := range d.AnnotatedSentence.Children(a.ID) { childA := d.AnnotatedSentence[childID] childA.SetHead(tmpHead) childA.DependencyType = tmpRel } } } // if more than one root... 
logf("More than zero compound roots not handled yet") } // Number fix } func properNounSpans(d *lingo.Dependency) (retVal []span) { start := -1 end := -1 for i, a := range d.AnnotatedSentence { if lingo.IsProperNoun(a.POSTag) { if start == -1 { start = i end = i + 1 } else { end = i + 1 } } else { if end == -1 { end = i } if start > -1 { s := makeSpan(start, end) retVal = append(retVal, s) } start = -1 end = -1 } } if start > -1 { s := makeSpan(start, len(d.AnnotatedSentence)) retVal = append(retVal, s) } return } ================================================ FILE: dep/init.go ================================================ package dep import "github.com/chewxy/lingo/corpus" func init() { c := corpus.New() c.Add("") // add null words KnownWords = c } ================================================ FILE: dep/models.go ================================================ package dep import ( "bufio" "bytes" "encoding/gob" "fmt" "io" "os" "github.com/chewxy/lingo/corpus" "github.com/pkg/errors" "gorgonia.org/tensor" ) // Model holds the neural network that a DependencyParser uses. 
To train, use a Trainer type Model struct { nn *neuralnetwork2 corpus *corpus.Corpus ts []transition } func (m *Model) Corpus() *corpus.Corpus { return m.corpus } func (m *Model) WordEmbeddings() *tensor.Dense { val := m.nn.e_w.Value().(*tensor.Dense) emb := val.Clone().(*tensor.Dense) return emb } func (m *Model) POSTagEmbeddings() *tensor.Dense { val := m.nn.e_t.Value().(*tensor.Dense) emb := val.Clone().(*tensor.Dense) return emb } func (m *Model) String() string { var buf bytes.Buffer buf.WriteString(m.nn.String()) buf.WriteString("Transitions: [") for _, t := range m.ts { fmt.Fprintf(&buf, "%v, ", t) } buf.WriteString("]") return buf.String() } func (m *Model) Save(filename string) error { if m.nn == nil { return errors.Errorf("Cannot save a model with no nn") } f, err := os.Create(filename) if err != nil { return err } return m.SaveWriter(f) } func (m *Model) SaveWriter(f io.WriteCloser) error { defer f.Close() w := bufio.NewWriter(f) defer w.Flush() encoder := gob.NewEncoder(w) if err := encoder.Encode(m.corpus); err != nil { return err } if err := encoder.Encode(m.nn); err != nil { return err } // if err := encoder.Encode(m.ts); err != nil { // return err // } return nil } func Load(filename string) (*Model, error) { f, err := os.Open(filename) if err != nil { return nil, err } return LoadReader(f) } func LoadReader(rd io.ReadCloser) (*Model, error) { defer rd.Close() r := bufio.NewReader(rd) decoder := gob.NewDecoder(r) m := new(Model) if err := decoder.Decode(&m.corpus); err != nil { return nil, err } m.nn = new(neuralnetwork2) m.nn.dict = m.corpus if err := decoder.Decode(&m.nn); err != nil { return nil, err } if err := decoder.Decode(&m.ts); err != nil { m.ts = transitions } m.nn.transitions = m.ts return m, nil } ================================================ FILE: dep/models_test.go ================================================ package dep import ( "os" "testing" "github.com/stretchr/testify/assert" G "gorgonia.org/gorgonia" ) func 
TestModel_SaveLoad(t *testing.T) { assert := assert.New(t) testFileName := "TestSave.dat" m := new(Model) // dumb shit if err := m.Save(testFileName); err == nil { t.Error("Expected an error") } conf := DefaultNNConfig conf.Dtype = G.Float32 m = new(Model) m.ts = transitions m.corpus = KnownWords m.nn = new(neuralnetwork2) m.nn.NNConfig = conf m.nn.dict = m.corpus if err := m.nn.init(); err != nil { t.Error(err) } if err := m.Save(testFileName); err != nil { t.Fatal(err) } var m2 *Model var err error if m2, err = Load(testFileName); err != nil { t.Error(err) } assert.Equal(m.corpus, m2.corpus, "Both Dependency Parsers need to have the same dict") if !G.ValueEq(m.nn.w2.Value(), m2.nn.w2.Value()) { t.Errorf("Expected w2 to be equal") } if !G.ValueEq(m.nn.e_w.Value(), m2.nn.e_w.Value()) { t.Errorf("Expected e_w to be equal") } // cleanup if err := os.Remove(testFileName); err != nil { t.Error(err) } } ================================================ FILE: dep/move.go ================================================ package dep // Move is an action that the dependency parser can take - whether to Shift, Attach-Left, or AttachRight type Move byte //go:generate stringer -type=Move const ( Shift Move = iota Left Right MAXMOVE ) // ALLMOVES is the set of all possible moves var ALLMOVES = [...]Move{Left, Right, Shift} ================================================ FILE: dep/move_string.go ================================================ // generated by stringer -type=Move; DO NOT EDIT package dep import "fmt" const _Move_name = "ShiftLeftRightMAXMOVE" var _Move_index = [...]uint8{0, 5, 9, 14, 21} func (i Move) String() string { if i >= Move(len(_Move_index)-1) { return fmt.Sprintf("Move(%d)", i) } return _Move_name[_Move_index[i]:_Move_index[i+1]] } ================================================ FILE: dep/nn2.go ================================================ package dep import ( "github.com/chewxy/lingo" "github.com/chewxy/lingo/corpus" "github.com/pkg/errors" G 
"gorgonia.org/gorgonia" "gorgonia.org/tensor" ) // may is a simple monad for handling errors type may struct { error n *G.Node } func (m *may) doUnary(fn func(*G.Node) (*G.Node, error)) { if m.error != nil { return } m.n, m.error = fn(m.n) } func (m *may) doBinary(fn func(a, b *G.Node) (*G.Node, error), other *G.Node) { if m.error != nil { return } m.n, m.error = fn(m.n, other) } func (m *may) doSwapBinary(fn func(a, b *G.Node) (*G.Node, error), other *G.Node) { if m.error != nil { return } m.n, m.error = fn(other, m.n) } type neuralnetwork2 struct { NNConfig g *G.ExprGraph sub *G.ExprGraph // model // embedding matrices for word, POSTags and labels respectively e_w *G.Node // Shape: (EmbeddingSize, DictSize) e_t *G.Node // Shape: (EmbeddingSize, lingo.MAXTAG) e_l *G.Node // Shape: (EmbeddingSize, lingo.MAXDEP) // w1 w1_w *G.Node // Shape: (HiddenSize, DictSize) w1_t *G.Node // Shape: (HiddenSize, lingo.MAXTAG) w1_l *G.Node // Shape: (HiddenSize, lingo.MAXDEP) b *G.Node // Shape: (HiddenSize) // w2 w2 *G.Node // Shape: (MAXTRANSITION, HiddenSize) // selects x_wSelW G.Nodes // 18 - word features x_tSelT G.Nodes // 18 - POSTag features x_lSelL G.Nodes // 12 - Dependency feature // inputs (feature vectors built up from the selects) x_w *G.Node x_t *G.Node x_l *G.Node // outputs scores *G.Node // argmax this to get the greedy decoded transition logProb *G.Node cost *G.Node costVal G.Value vm G.VM model G.Nodes solver G.Solver dict *corpus.Corpus transitions []transition costChan chan G.Value // wordfeats *G.Node // tagfeats *G.Node // depfeats *G.Node // sumfeats *G.Node // act *G.Node } func (nn *neuralnetwork2) initialized() bool { return nn.g != nil && nn.sub != nil && nn.e_w != nil && nn.e_t != nil && nn.e_l != nil && nn.w1_w != nil && nn.w1_t != nil && nn.w1_l != nil && nn.b != nil && nn.w2 != nil && len(nn.x_wSelW) > 0 && len(nn.x_tSelT) > 0 && len(nn.x_lSelL) > 0 && nn.x_w != nil && nn.x_t != nil && nn.x_l != nil && nn.scores != nil && nn.dict != nil && nn.vm != 
nil && nn.solver != nil } func (nn *neuralnetwork2) init() error { if nn.dict == nil { return errors.Errorf("No Corpus Provided to the Neural Network. Will be unable to decode") } g := G.NewGraph() nn.g = g word := nn.dict.Size() tags := int(lingo.MAXTAG) deps := int(lingo.MAXDEPTYPE) // trns := len(nn.transitions) wordFeats := POS_OFFSET - 0 tagFeats := DEP_OFFSET - POS_OFFSET depFeats := int(MAXFEATURE) - DEP_OFFSET // In any case a very very very small dict was passed in // we set the minimum to wordFeatss if word < wordFeats { word = wordFeats } logf(`Word: %d tags: %d deps: %d wordFeats: %d tagFeats: %d depFeats: %d `, word, tags, deps, wordFeats, tagFeats, depFeats) // define inputs nn.x_w = G.NewVector(g, nn.Dtype, G.WithShape(wordFeats*nn.EmbeddingSize), G.WithName("word input"), G.WithInit(G.Zeroes())) nn.x_t = G.NewVector(g, nn.Dtype, G.WithShape(tagFeats*nn.EmbeddingSize), G.WithName("POSTag input"), G.WithInit(G.Zeroes())) nn.x_l = G.NewVector(g, nn.Dtype, G.WithShape(depFeats*nn.EmbeddingSize), G.WithName("word input"), G.WithInit(G.Zeroes())) nn.x_wSelW = make(G.Nodes, wordFeats) nn.x_tSelT = make(G.Nodes, tagFeats) nn.x_lSelL = make(G.Nodes, depFeats) // define models nn.e_w = G.NewMatrix(g, nn.Dtype, G.WithShape(word, nn.EmbeddingSize), G.WithName("e_w"), G.WithInit(G.GlorotU(1))) nn.e_t = G.NewMatrix(g, nn.Dtype, G.WithShape(tags, nn.EmbeddingSize), G.WithName("e_t"), G.WithInit(G.GlorotU(1))) nn.e_l = G.NewMatrix(g, nn.Dtype, G.WithShape(deps, nn.EmbeddingSize), G.WithName("e_l"), G.WithInit(G.GlorotU(1))) nn.w1_w = G.NewMatrix(g, nn.Dtype, G.WithShape(nn.HiddenSize, nn.EmbeddingSize*wordFeats), G.WithName("w1_w"), G.WithInit(G.GlorotU(1))) nn.w1_t = G.NewMatrix(g, nn.Dtype, G.WithShape(nn.HiddenSize, nn.EmbeddingSize*tagFeats), G.WithName("w1_t"), G.WithInit(G.GlorotU(1))) nn.w1_l = G.NewMatrix(g, nn.Dtype, G.WithShape(nn.HiddenSize, nn.EmbeddingSize*depFeats), G.WithName("w1_l"), G.WithInit(G.GlorotU(1))) nn.b = G.NewVector(g, nn.Dtype, 
G.WithShape(nn.HiddenSize), G.WithName("b"), G.WithInit(G.Zeroes())) nn.w2 = G.NewMatrix(g, nn.Dtype, G.WithShape(MAXTRANSITION, nn.HiddenSize), G.WithName("w2"), G.WithInit(G.GlorotU(1))) nn.model = G.Nodes{nn.e_w, nn.e_t, nn.e_l, nn.w1_w, nn.w1_t, nn.w1_l, nn.b, nn.w2} // define selects // words first logf("nn.e_w: %+1.1s", nn.e_w.Value()) var err error for i := 0; i < wordFeats; i++ { if nn.x_wSelW[i], err = G.Slice(nn.e_w, G.S(i)); err != nil { // dummy slices... they'll be replaced at runtime return err } } // tag features for i := 0; i < tagFeats; i++ { if nn.x_tSelT[i], err = G.Slice(nn.e_t, G.S(i)); err != nil { // dummy slices... they'll be replaced at runtime return err } } // dependency features for i := 0; i < depFeats; i++ { if nn.x_lSelL[i], err = G.Slice(nn.e_l, G.S(i)); err != nil { return err } } // forwards if err = nn.fwd(); err != nil { return err } // backprop if _, err = G.Grad(nn.cost, nn.model...); err != nil { return err } nn.sub = g.SubgraphRoots(nn.scores) // prog, locmap, err := G.Compile(nn.g) // if err != nil { // return err // } // log.Printf("Prog: %v", prog) // ioutil.WriteFile("graph.dot", []byte(g.ToDot()), 0644) // logger := log.New(os.Stderr, "", 0) // nn.vm = G.NewTapeMachine(prog, locmap, G.BindDualValues(nn.model...), G.UseCudaFor(), G.WithLogger(logger), G.WithWatchlist()) // nn.vm = G.NewTapeMachine(prog, locmap, G.BindDualValues(nn.model...), G.UseCudaFor()) nn.vm = G.NewTapeMachine(nn.g, G.BindDualValues(nn.model...), G.UseCudaFor()) G.BindDualValues(nn.scores)(nn.vm) // makes sure that scores is a *dualValue nn.solver = G.NewAdaGradSolver(G.WithLearnRate(nn.AdaAlpha), G.WithEps(nn.AdaEps), G.WithL2Reg(nn.Reg), G.WithBatchSize(float64(nn.BatchSize))) // nn.solver = G.NewVanillaSolver(G.WithLearnRate(nn.AdaAlpha), G.WithL2Reg(nn.Reg)) return nil } func (nn *neuralnetwork2) fwd() error { var err error // build up x vectors if nn.x_w, err = G.Concat(0, nn.x_wSelW...); err != nil { return err } if nn.x_t, err = G.Concat(0, 
nn.x_tSelT...); err != nil { return err } if nn.x_l, err = G.Concat(0, nn.x_lSelL...); err != nil { return err } logf("w1_w %v, x_w %v", nn.w1_w.Shape(), nn.x_w.Shape()) m_w := &may{nil, nn.w1_w} m_w.doBinary(G.Mul, nn.x_w) if m_w.error != nil { return m_w.error } logf("w1_t %v, x_t %v", nn.w1_t.Shape(), nn.x_t.Shape()) m_t := &may{nil, nn.w1_t} m_t.doBinary(G.Mul, nn.x_t) if m_t.error != nil { return m_t.error } logf("w1_l %v, x_l %v", nn.w1_l.Shape(), nn.x_l.Shape()) m_l := &may{nil, nn.w1_l} m_l.doBinary(G.Mul, nn.x_l) if m_l.error != nil { return m_l.error } // add and activate layer 1 logf("w : %v", m_w.n.Shape()) m_w1 := &may{nil, m_w.n} m_w1.doBinary(G.Add, m_t.n) m_w1.doBinary(G.Add, m_l.n) m_w1.doBinary(G.Add, nn.b) m_w1.doUnary(G.Cube) if m_w1.error != nil { return m_w1.error } if nn.Dropout > 0 { logf("Doing dropout") m_w1.n, m_w1.error = G.Dropout(m_w1.n, nn.Dropout) if m_w1.error != nil { return m_w1.error } } // go to softmax layer logf("w2: %v, w1act: %v", nn.w2.Shape(), m_w1.n.Shape()) m_sm := &may{nil, nn.w2} m_sm.doBinary(G.Mul, m_w1.n) nn.scores = m_sm.n m_sm.doUnary(G.SoftMax) if m_sm.error != nil { return m_sm } nn.logProb = m_sm.n // G.WithName("Logprob")(nn.logProb) // log.Printf("LOGPROB %v %p %v", nn.logProb, nn.logProb, nn.logProb) if nn.cost, err = G.Slice(nn.logProb, G.S(0)); err != nil { // slice is a dummy tensor.Slice. It'll be replaced at runtime return err } G.Read(nn.cost, &nn.costVal) return nil } func (nn *neuralnetwork2) costProgress() <-chan G.Value { if nn.costChan == nil { nn.costChan = make(chan G.Value) } return nn.costChan } // train does one epoch of training. The examples are batched. 
func (nn *neuralnetwork2) train(examples []example) error { size := len(examples) batches := size / nn.BatchSize var start, end int if nn.BatchSize > size { batches = 1 end = size G.WithBatchSize(float64(size))(nn.solver) // set it such that the solver doesn't get confused } else { end = nn.BatchSize } for batch := 0; batch < batches; batch++ { for _, ex := range examples[start:end] { nn.feats2vec(ex.features) tid := lookupTransition(ex.transition, nn.transitions) if err := G.UnsafeLet(nn.cost, G.S(tid)); err != nil { return err } if err := nn.vm.RunAll(); err != nil { return err } nn.vm.Reset() } if err := nn.solver.Step(G.NodesToValueGrads(nn.model)); err != nil { err = errors.Wrapf(err, "Stepping on the model failed %v", batch) return err } if nn.costChan != nil { nn.costChan <- nn.costVal } start = end if start >= size { break } end += nn.BatchSize if end >= size { end = size } } return nil } // pred predicts the index of the transitions func (nn *neuralnetwork2) pred(ind []int) (int, error) { nn.feats2vec(ind) // f, _ := os.OpenFile("LOOOOOG", os.O_APPEND|os.O_CREATE|os.O_RDWR, 0644) // logger := log.New(f, "", 0) // logger := log.New(os.Stderr, "", 0) // m := G.NewLispMachine(nn.sub, G.ExecuteFwdOnly(), G.WithLogger(logger), G.WithWatchlist(), G.LogBothDir(), G.WithValueFmt("%+3.3v")) m := G.NewLispMachine(nn.sub, G.ExecuteFwdOnly()) if err := m.RunAll(); err != nil { return 0, err } // logger.Println("========================\n") val := nn.scores.Value().(tensor.Tensor) t, err := tensor.Argmax(val, tensor.AllAxes) if err != nil { return 0, err } return t.ScalarValue().(int), nil } // utility function func (nn *neuralnetwork2) feats2vec(indicators []int) error { // fix word features for i, ind := range indicators[:POS_OFFSET] { if err := G.UnsafeLet(nn.x_wSelW[i], G.S(ind-wordFeatsStartAt)); err != nil { return err } } // fix tag features for i, ind := range indicators[POS_OFFSET:DEP_OFFSET] { if err := G.UnsafeLet(nn.x_tSelT[i], G.S(ind)); err != nil { 
return err } } for i, ind := range indicators[DEP_OFFSET:] { if err := G.UnsafeLet(nn.x_lSelL[i], G.S(ind-labelFeatsStartAt)); err != nil { return err } } return nil } ================================================ FILE: dep/nn2_io.go ================================================ package dep import ( "bytes" "encoding/gob" "fmt" "github.com/pkg/errors" G "gorgonia.org/gorgonia" T "gorgonia.org/tensor" ) var empty struct{} func (nn *neuralnetwork2) String() string { s := `Config ------ %v Info ------ Embeddings_Word : %v Embeddings_POStag : %v Embeddings_Dependency : %v Selects_Words : %d Selects_POSTag : %d Selects_Dependency : %d Weights1_Word : %v Weights1_POSTag : %v Weights1_Dependency : %v Biases : %v Weights2 : %v ` return fmt.Sprintf(s, nn.NNConfig, nn.e_w.Shape(), nn.e_t.Shape(), nn.e_l.Shape(), len(nn.x_wSelW), len(nn.x_tSelT), len(nn.x_lSelL), nn.w1_w.Shape(), nn.w1_t.Shape(), nn.w1_l.Shape(), nn.b.Shape(), nn.w2.Shape()) } func (nn *neuralnetwork2) GobEncode() ([]byte, error) { if !nn.initialized() { return nil, errors.Errorf("Neural network not initialized. 
Cannot gob") } var buf bytes.Buffer encoder := gob.NewEncoder(&buf) if err := encoder.Encode(nn.NNConfig); err != nil { return nil, err } if err := encoder.Encode(nn.e_w.Value()); err != nil { return nil, err } if err := encoder.Encode(nn.e_t.Value()); err != nil { return nil, err } if err := encoder.Encode(nn.e_l.Value()); err != nil { return nil, err } if err := encoder.Encode(nn.w1_w.Value()); err != nil { return nil, err } if err := encoder.Encode(nn.w1_t.Value()); err != nil { return nil, err } if err := encoder.Encode(nn.w1_l.Value()); err != nil { return nil, err } if err := encoder.Encode(nn.b.Value()); err != nil { return nil, err } if err := encoder.Encode(nn.w2.Value()); err != nil { return nil, err } return buf.Bytes(), nil } func (nn *neuralnetwork2) GobDecode(buf []byte) error { // prechecks if nn.dict == nil { return errors.Errorf("Neural Network has no corpus attached to it (Corpuses are serialized separately).") } b := bytes.NewBuffer(buf) decoder := gob.NewDecoder(b) if err := decoder.Decode(&nn.NNConfig); err != nil { return err } if err := nn.init(); err != nil { return err } e_w := T.New(T.Of(nn.Dtype), T.WithShape(nn.e_w.Shape()...)) if err := decoder.Decode(e_w); err != nil { return err } G.Let(nn.e_w, e_w) e_t := T.New(T.Of(nn.Dtype), T.WithShape(nn.e_t.Shape()...)) if err := decoder.Decode(e_t); err != nil { return err } G.Let(nn.e_t, e_t) e_l := T.New(T.Of(nn.Dtype), T.WithShape(nn.e_l.Shape()...)) if err := decoder.Decode(e_l); err != nil { return err } G.Let(nn.e_l, e_l) w1_w := T.New(T.Of(nn.Dtype), T.WithShape(nn.w1_w.Shape()...)) if err := decoder.Decode(w1_w); err != nil { return err } G.Let(nn.w1_w, w1_w) w1_t := T.New(T.Of(nn.Dtype), T.WithShape(nn.w1_t.Shape()...)) if err := decoder.Decode(w1_t); err != nil { return err } G.Let(nn.w1_t, w1_t) w1_l := T.New(T.Of(nn.Dtype), T.WithShape(nn.w1_l.Shape()...)) if err := decoder.Decode(w1_l); err != nil { return err } G.Let(nn.w1_l, w1_l) bias := T.New(T.Of(nn.Dtype), 
T.WithShape(nn.b.Shape()...)) if err := decoder.Decode(bias); err != nil { return err } G.Let(nn.b, bias) w2 := T.New(T.Of(nn.Dtype), T.WithShape(nn.w2.Shape()...)) if err := decoder.Decode(w2); err != nil { return err } G.Let(nn.w2, w2) return nil } ================================================ FILE: dep/nn2_io_test.go ================================================ package dep import ( "bytes" "encoding/gob" "fmt" "testing" "github.com/chewxy/lingo" "github.com/chewxy/lingo/corpus" G "gorgonia.org/gorgonia" ) func TestNNIO(t *testing.T) { sts := allSentences() nn := new(neuralnetwork2) nn.NNConfig = DefaultNNConfig nn.dict = corpus.GenerateCorpus(sts) nn.transitions = transitions if err := nn.init(); err != nil { t.Fatalf("%+v", err) } s := `Config ------ Batch Size : 10000 Dropout Rate : 0.500000 AdaGrad Eps (ε) : 0.000001 AdaGrad Learn Rate (η) : 0.010000 Regularization Parameter : 0.000002 Hidden Layer Size : 200 Embedding Size : 50 Number Precomputed : 30000 Evaluate Per 100 Iterations Clear Gradients Per 0 Iterations Dtype: float64 Info ------ Embeddings_Word : (74, 50) Embeddings_POStag : (%d, 50) Embeddings_Dependency : (%d, 50) Selects_Words : 18 Selects_POSTag : 18 Selects_Dependency : 12 Weights1_Word : (200, 900) Weights1_POSTag : (200, 900) Weights1_Dependency : (200, 600) Biases : (200) Weights2 : (%d, 200) ` correctDesc := fmt.Sprintf(s, lingo.MAXTAG, lingo.MAXDEPTYPE, MAXTRANSITION) if nn.String() != correctDesc { t.Errorf("Oops. Got %q. Want %q", nn.String(), correctDesc) } // nn.Dtype = tensor.Float32 var buf bytes.Buffer encoder := gob.NewEncoder(&buf) if err := encoder.Encode(nn); err != nil { t.Fatalf("%+v", err) } decoder := gob.NewDecoder(&buf) nn2 := new(neuralnetwork2) nn2.dict = corpus.GenerateCorpus(sts) nn2.transitions = transitions if err := decoder.Decode(nn2); err != nil { t.Fatal(err) } if nn.String() != correctDesc { t.Fatalf("Oops. Got %q. 
Want %q", nn.String(), correctDesc) } if !G.ValueEq(nn.e_w.Value(), nn2.e_w.Value()) { t.Errorf("Expected e_w to be the same. Expected %1.1s. Got %1.1s", nn.e_w.Value(), nn2.e_w.Value()) } if !G.ValueEq(nn.e_t.Value(), nn2.e_t.Value()) { t.Errorf("Expected e_t to be the same. Expected %1.1s. Got %1.1s", nn.e_t.Value(), nn2.e_t.Value()) } if !G.ValueEq(nn.e_l.Value(), nn2.e_l.Value()) { t.Errorf("Expected e_l to be the same. Expected %1.1s. Got %1.1s", nn.e_l.Value(), nn2.e_l.Value()) } if !G.ValueEq(nn.w1_w.Value(), nn2.w1_w.Value()) { t.Errorf("Expected w1_w to be the same. Expected %1.1s. Got %1.1s", nn.w1_w.Value(), nn2.w1_w.Value()) } if !G.ValueEq(nn.w1_t.Value(), nn2.w1_t.Value()) { t.Errorf("Expected w1_t to be the same. Expected %1.1s. Got %1.1s", nn.w1_t.Value(), nn2.w1_t.Value()) } if !G.ValueEq(nn.w1_l.Value(), nn2.w1_l.Value()) { t.Errorf("Expected w1_l to be the same. Expected %1.1s. Got %1.1s", nn.w1_l.Value(), nn2.w1_l.Value()) } if !G.ValueEq(nn.b.Value(), nn2.b.Value()) { t.Errorf("Expected b to be the same. Expected %1.1s. Got %1.1s", nn.b.Value(), nn2.b.Value()) } if !G.ValueEq(nn.w2.Value(), nn2.w2.Value()) { t.Errorf("Expected w2 to be the same. Expected %1.1s. 
Got %1.1s", nn.w2.Value(), nn2.w2.Value()) } t.Logf("Visual Inspection: \n%+1.8s\n%+1.8s", nn.e_w.Value(), nn2.e_w.Value()) // special case buf.Reset() encoder = gob.NewEncoder(&buf) if err := encoder.Encode(nn); err != nil { t.Fatalf("%+v", err) } decoder = gob.NewDecoder(&buf) nn3 := new(neuralnetwork2) if err := decoder.Decode(nn3); err == nil { t.Error("Expected a nocorpus error") } } ================================================ FILE: dep/nn2_test.go ================================================ package dep import ( "math/rand" "testing" "time" "github.com/chewxy/lingo/corpus" "gorgonia.org/gorgonia" ) func TestNN2(t *testing.T) { rand.Seed(1337) // we test 50 iterations unless the short flag is passed in epochs := 50 if testing.Short() { epochs = 10 } sts := allSentences() nn := new(neuralnetwork2) nn.NNConfig = DefaultNNConfig nn.Dtype = gorgonia.Float32 nn.dict = corpus.GenerateCorpus(sts) nn.transitions = transitions if err := nn.init(); err != nil { t.Fatalf("%+v", err) } var costs []float64 ch := nn.costProgress() sigChan := make(chan struct{}) go func(ch <-chan gorgonia.Value, sig chan struct{}) { for cost := range ch { switch c := cost.Data().(type) { case float32: costs = append(costs, float64(c)) case float64: costs = append(costs, c) } t.Logf("Cost %v", cost) } sig <- struct{}{} }(ch, sigChan) exs := makeExamples(sts, nn.NNConfig, nn.dict, transitions, dummyFix{}) start := time.Now() for i := 0; i < epochs; i++ { if err := nn.train(exs); err != nil { t.Errorf("%+v", err) } shuffleExamples(exs) } // simulate what *DependencyParser would do close(nn.costChan) nn.costChan = nil t.Logf("Training %d iterations took Taken: %v", epochs, time.Since(start)) <-sigChan if len(costs) == 0 { t.Error("Expected some costs") } if costs[0] <= costs[len(costs)-1] { t.Error("Expected costs to have reduced during training") } // PREDICTION TIME! 
ss2 := simpleSentence() exs = makeExamples(ss2, nn.NNConfig, nn.dict, transitions, dummyFix{}) start = time.Now() for i, ex := range exs { ind, err := nn.pred(ex.features) if err != nil { t.Errorf("Example %d failed: %v", i, err) continue } t.Logf("Example %d. Want: %v. Got %v. Same: %t", i, ex.transition, transitions[ind], ex.transition == transitions[ind]) } t.Logf("Pred Time Taken: %v", time.Since(start)) } ================================================ FILE: dep/nnconfig.go ================================================ package dep import ( "bytes" "encoding/gob" "fmt" "github.com/pkg/errors" "gorgonia.org/tensor" ) // NNConfig configures the neural network type NNConfig struct { BatchSize int // 10000 Dropout float64 // 0.5 AdaEps float64 // 1e-6 AdaAlpha float64 //0.02 Reg float64 // 1e-8 HiddenSize int // 200 EmbeddingSize int // 50 NumPrecomputed int //100000 EvalPerIteration int // 100 ClearGradientsPerIteration int // 0 Dtype tensor.Dtype } func (c NNConfig) String() string { s := `Batch Size : %d Dropout Rate : %f AdaGrad Eps (ε) : %f AdaGrad Learn Rate (η) : %f Regularization Parameter : %f Hidden Layer Size : %d Embedding Size : %d Number Precomputed : %d Evaluate Per %d Iterations Clear Gradients Per %d Iterations Dtype: %v ` return fmt.Sprintf(s, c.BatchSize, c.Dropout, c.AdaEps, c.AdaAlpha, c.Reg, c.HiddenSize, c.EmbeddingSize, c.NumPrecomputed, c.EvalPerIteration, c.ClearGradientsPerIteration, c.Dtype) } // DefaultNNConfig is the default config that is passed in, for initialization purposses. 
var DefaultNNConfig NNConfig func (c NNConfig) GobEncode() ([]byte, error) { var buf bytes.Buffer encoder := gob.NewEncoder(&buf) encoder.Encode(c.BatchSize) encoder.Encode(c.Dropout) encoder.Encode(c.AdaEps) encoder.Encode(c.AdaAlpha) encoder.Encode(c.Reg) encoder.Encode(c.HiddenSize) encoder.Encode(c.EmbeddingSize) encoder.Encode(c.NumPrecomputed) encoder.Encode(c.EvalPerIteration) encoder.Encode(c.ClearGradientsPerIteration) switch c.Dtype { case tensor.Float64: encoder.Encode(byte(0)) case tensor.Float32: encoder.Encode(byte(1)) default: return nil, errors.Errorf("Unsupported Dtype to be GobEncoded") } return buf.Bytes(), nil } func (c *NNConfig) GobDecode(p []byte) error { b := bytes.NewBuffer(p) decoder := gob.NewDecoder(b) decoder.Decode(&c.BatchSize) decoder.Decode(&c.Dropout) decoder.Decode(&c.AdaEps) decoder.Decode(&c.AdaAlpha) decoder.Decode(&c.Reg) decoder.Decode(&c.HiddenSize) decoder.Decode(&c.EmbeddingSize) decoder.Decode(&c.NumPrecomputed) decoder.Decode(&c.EvalPerIteration) decoder.Decode(&c.ClearGradientsPerIteration) var bite byte decoder.Decode(&bite) switch bite { case 0: c.Dtype = tensor.Float64 case 1: c.Dtype = tensor.Float32 default: return errors.Errorf("Unsupported Dtype to be GobDecoded: %v", bite) } return nil } func init() { DefaultNNConfig = NNConfig{ BatchSize: 10000, Dropout: 0.5, AdaEps: 1e-6, AdaAlpha: 0.01, Reg: 1.5e-6, HiddenSize: 200, EmbeddingSize: 50, NumPrecomputed: 30000, EvalPerIteration: 100, ClearGradientsPerIteration: 0, Dtype: tensor.Float64, // Dtype: gorgonia.Float32, } } ================================================ FILE: dep/release.go ================================================ // +build !debug package dep const BUILD_DEBUG = "PARSER: RELEASE BUILD" const BUILD_DIAG = "Non-Diagnostic Build" const DEBUG = false var READMEMSTATS = false var TABCOUNT uint32 = 0 func enterLoggingContext() {} func leaveLoggingContext() {} func logTrainingProgress(iteration, correct, total, length, possibles int) {} func 
logMemStats() {} func logf(format string, others ...interface{}) {} func recoverFrom(format string, attrs ...interface{}) {} func (d *Parser) SprintFeatures(feature []int) string { return "" } func SprintScores(scores []float64, ts []transition) string { return "" } ================================================ FILE: dep/span.go ================================================ package dep type span struct { start, end int } func makeSpan(start, end int) span { if end <= start { panic("Impossible span created") } return span{start, end} } func (s span) combine(other span) span { start := minInt(s.start, other.start) end := maxInt(s.end, other.end) return span{start, end} } ================================================ FILE: dep/test_test.go ================================================ package dep import ( "bufio" "crypto/md5" "encoding/gob" "fmt" "io" "log" "os" "strings" "github.com/chewxy/lingo" "github.com/chewxy/lingo/treebank" "github.com/kljensen/snowball" ) type dummyLem struct{} func (dummyLem) Lemmatize(s string, pt lingo.POSTag) ([]string, error) { return nil, componentUnavailable("lemmatizer") } type dummyStemmer struct{} func (dummyStemmer) Stem(s string) (string, error) { return snowball.Stem(s, "english", true) } type dummyFix struct { dummyStemmer dummyLem } func (dummyFix) Clusters() (map[string]lingo.Cluster, error) { return nil, componentUnavailable("clusters") } const nnps = `1 Guerrillas guerrilla NOUN NNS Number=Plur 2 nsubj _ _ 2 threatened threaten VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root _ _ 3 to to PART TO _ 4 mark _ _ 4 assassinate assassinate VERB VB VerbForm=Inf 2 xcomp _ _ 5 Prime Prime PROPN NNP Number=Sing 6 compound _ _ 6 Minister Minister PROPN NNP Number=Sing 8 compound _ _ 7 Iyad Iyad PROPN NNP Number=Sing 8 compound _ _ 8 Allawi Allawi PROPN NNP Number=Sing 4 dobj _ _ 9 and and CONJ CC _ 8 cc _ _ 10 Minister Minister PROPN NNP Number=Sing 14 compound _ _ 11 of of ADP IN _ 12 case _ _ 12 Defense Defense PROPN NNP 
Number=Sing 10 nmod _ _ 13 Hazem Hazem PROPN NNP Number=Sing 14 compound _ _ 14 Shaalan Shaalan PROPN NNP Number=Sing 8 conj _ _ 15 in in ADP IN _ 16 case _ _ 16 retaliation retaliation NOUN NN Number=Sing 4 nmod _ _ 17 for for ADP IN _ 19 case _ _ 18 the the DET DT Definite=Def|PronType=Art 19 det _ _ 19 attack attack NOUN NN Number=Sing 16 nmod _ _ 20 . . PUNCT . _ 2 punct _ _ ` const simple = `1 Yet yet CONJ CC _ 5 cc _ _ 2 we we PRON PRP Case=Nom|Number=Plur|Person=1|PronType=Prs 5 nsubj _ _ 3 did do AUX VBD Mood=Ind|Tense=Past|VerbForm=Fin 5 aux _ _ 4 n't not PART RB _ 5 neg _ _ 5 charge charge VERB VB VerbForm=Inf 0 root _ _ 6 them they PRON PRP Case=Acc|Number=Plur|Person=3|PronType=Prs 5 dobj _ _ 7 for for ADP IN _ 9 case _ _ 8 the the DET DT Definite=Def|PronType=Art 9 det _ _ 9 evacuation evacuation NOUN NN Number=Sing 5 nmod _ _ 10 . . PUNCT . _ 5 punct _ _ ` const med = `1 President President PROPN NNP Number=Sing 2 compound _ _ 2 Bush Bush PROPN NNP Number=Sing 5 nsubj _ _ 3 on on ADP IN _ 4 case _ _ 4 Tuesday Tuesday PROPN NNP Number=Sing 5 nmod _ _ 5 nominated nominate VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root _ _ 6 two two NUM CD NumType=Card 7 nummod _ _ 7 individuals individual NOUN NNS Number=Plur 5 dobj _ _ 8 to to PART TO _ 9 mark _ _ 9 replace replace VERB VB VerbForm=Inf 5 advcl _ _ 10 retiring retire VERB VBG VerbForm=Ger 11 amod _ _ 11 jurists jurist NOUN NNS Number=Plur 9 dobj _ _ 12 on on ADP IN _ 14 case _ _ 13 federal federal ADJ JJ Degree=Pos 14 amod _ _ 14 courts court NOUN NNS Number=Plur 11 nmod _ _ 15 in in ADP IN _ 18 case _ _ 16 the the DET DT Definite=Def|PronType=Art 18 det _ _ 17 Washington Washington PROPN NNP Number=Sing 18 compound _ _ 18 area area NOUN NN Number=Sing 14 nmod _ _ 19 . . PUNCT . 
_ 5 punct _ _ ` const long = `1 Now now ADV RB _ 5 advmod _ _ 2 , , PUNCT , _ 5 punct _ _ 3 I I PRON PRP Case=Nom|Number=Sing|Person=1|PronType=Prs 5 nsubj _ _ 4 would would AUX MD VerbForm=Fin 5 aux _ _ 5 argue argue VERB VB VerbForm=Inf 0 root _ _ 6 that that SCONJ IN _ 11 mark _ _ 7 one one PRON PRP _ 11 nsubj _ _ 8 could could AUX MD VerbForm=Fin 11 aux _ _ 9 have have AUX VB VerbForm=Inf 11 aux _ _ 10 reasonably reasonably ADV RB _ 11 advmod _ _ 11 predicted predict VERB VBN Tense=Past|VerbForm=Part 5 ccomp _ _ 12 that that SCONJ IN _ 19 mark _ _ 13 some some DET DT _ 14 det _ _ 14 form form NOUN NN Number=Sing 19 nsubj _ _ 15 of of ADP IN _ 17 case _ _ 16 military military ADJ JJ Degree=Pos 17 amod _ _ 17 violence violence NOUN NN Number=Sing 14 nmod _ _ 18 was be VERB VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 19 cop _ _ 19 likely likely ADJ JJ Degree=Pos 11 ccomp _ _ 20 to to PART TO _ 21 mark _ _ 21 occur occur VERB VB VerbForm=Inf 19 xcomp _ _ 22 in in ADP IN _ 23 case _ _ 23 Lebanon Lebanon PROPN NNP Number=Sing 21 nmod _ _ 24 -LRB- -lrb- PUNCT -LRB- _ 25 punct _ _ 25 considering consider VERB VBG VerbForm=Ger 19 advcl _ _ 26 that that SCONJ IN _ 31 mark _ _ 27 the the DET DT Definite=Def|PronType=Art 28 det _ _ 28 country country NOUN NN Number=Sing 31 nsubj _ _ 29 has have AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 31 aux _ _ 30 been be AUX VBN Tense=Past|VerbForm=Part 31 aux _ _ 31 experiencing experience VERB VBG Tense=Pres|VerbForm=Part 25 ccomp _ _ 32 some some DET DT _ 33 det _ _ 33 form form NOUN NN Number=Sing 31 dobj _ _ 34 of of ADP IN _ 35 case _ _ 35 conflict conflict NOUN NN Number=Sing 33 nmod _ _ 36 for for ADP IN _ 41 case _ _ 37 approximately approximately ADV RB _ 41 advmod _ _ 38 the the DET DT Definite=Def|PronType=Art 41 det _ _ 39 last last ADJ JJ Degree=Pos 41 amod _ _ 40 32 32 NUM CD NumType=Card 41 nummod _ _ 41 years year NOUN NNS Number=Plur 31 nmod _ _ 42 -RRB- -rrb- PUNCT -RRB- _ 25 punct _ 
_ 43 . . PUNCT . _ 5 punct _ _ ` const cvconllu = `1 Google Google PROPN NNP Number=Sing 6 nsubj _ _ 2 is be VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 6 cop _ _ 3 a a DET DT Definite=Ind|PronType=Art 6 det _ _ 4 nice nice ADJ JJ Degree=Pos 6 amod _ _ 5 search search NOUN NN Number=Sing 6 compound _ _ 6 engine engine NOUN NN Number=Sing 0 root _ _ 7 . . PUNCT . _ 6 punct _ _ 1 Does do AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 aux _ _ 2 anybody anybody NOUN NN Number=Sing 3 nsubj _ _ 3 use use VERB VB VerbForm=Inf 0 root _ _ 4 it it PRON PRP Case=Acc|Gender=Neut|Number=Sing|Person=3|PronType=Prs 3 dobj _ _ 5 for for ADP IN _ 6 case _ _ 6 anything anything NOUN NN Number=Sing 3 nmod _ _ 7 else else ADJ JJ Degree=Pos 6 amod _ _ 8 ? ? PUNCT . _ 3 punct _ _ ` func lotsaNNP() *lingo.Dependency { readr := strings.NewReader(nnps) sentenceTags := treebank.ReadConllu(readr) return sentenceTags[0].Dependency(dummyFix{}) } // simpleSentence has 10 words func simpleSentence() []treebank.SentenceTag { readr := strings.NewReader(simple) return treebank.ReadConllu(readr) } func mediumSentence() []treebank.SentenceTag { readr := strings.NewReader(med) return treebank.ReadConllu(readr) } // longSentence has 44 words func longSentence() []treebank.SentenceTag { readr := strings.NewReader(long) return treebank.ReadConllu(readr) } func allSentences() []treebank.SentenceTag { sentenceTags := treebank.ReadConllu(strings.NewReader(nnps)) sentenceTags = append(sentenceTags, treebank.ReadConllu(strings.NewReader(simple))...) sentenceTags = append(sentenceTags, treebank.ReadConllu(strings.NewReader(med))...) sentenceTags = append(sentenceTags, treebank.ReadConllu(strings.NewReader(long))...) 
return sentenceTags
}

// cvSentences parses the cvconllu fixture.
func cvSentences() []treebank.SentenceTag {
	return treebank.ReadConllu(strings.NewReader(cvconllu))
}

// hash returns the hex encoded MD5 sum of s. It is only used to derive cache file names.
func hash(s string) string {
	h := md5.New()
	io.WriteString(h, s)
	return fmt.Sprintf("%x", h.Sum(nil))
}

// cache gob encodes s into cached/<hash of input>.cached.
// Any failure ends the process through log.Fatal; since log.Fatal exits,
// the deferred Flush and Close do not run on those paths.
func cache(input string, s lingo.AnnotatedSentence) {
	hashfilename := "cached/" + hash(input) + ".cached"
	f, err := os.Create(hashfilename)
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	w := bufio.NewWriter(f)
	defer w.Flush() // runs before f.Close, so the buffered bytes reach the file

	encoder := gob.NewEncoder(w)
	if err := encoder.Encode(s); err != nil {
		log.Fatal(err)
	}
}

// useCached gob decodes an annotated sentence from filename, fixes it up and
// returns its dependency. Any failure ends the process through log.Fatal.
func useCached(filename string) *lingo.Dependency {
	f, err := os.Open(filename)
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	r := bufio.NewReader(f)
	decoder := gob.NewDecoder(r)

	var sentence lingo.AnnotatedSentence
	if err := decoder.Decode(&sentence); err != nil {
		log.Fatal(err)
	}

	// fixes ID and what nots
	sentence.Fix()

	dep := sentence.Dependency()
	return dep
}

================================================
FILE: dep/train.go
================================================
package dep

import (
	"fmt"
	"os"
	"sync"

	"github.com/chewxy/lingo"
	"github.com/chewxy/lingo/corpus"
	"github.com/chewxy/lingo/treebank"
	"github.com/pkg/errors"
)

// TrainerConsOpt is a construction option for a Trainer
type TrainerConsOpt func(t *Trainer)

// WithTrainingModel replaces the Trainer's model with m
func WithTrainingModel(m *Model) TrainerConsOpt {
	f := func(t *Trainer) {
		t.Model = m
	}
	return f
}

// WithTrainingSet creates a trainer with a training set
func WithTrainingSet(st []treebank.SentenceTag) TrainerConsOpt {
	f := func(t *Trainer) {
		t.trainingSet = st
	}
	return f
}

// WithCrossValidationSet creates a trainer with a cross validation set
func WithCrossValidationSet(st []treebank.SentenceTag) TrainerConsOpt {
	f := func(t *Trainer) {
		t.crossValSet = st
	}
	return f
}

// WithConfig sets up a *Trainer with a NNConfig.
//
// The network's dict and transitions are copied from the Trainer's corpus and
// transitions as they are at the moment the option is applied.
func WithConfig(conf NNConfig) TrainerConsOpt {
	f := func(t *Trainer) {
		t.nn.NNConfig = conf
		t.nn.dict = t.corpus
		t.nn.transitions = t.ts
		t.EvalPerIter = conf.EvalPerIteration
	}
	return f
}

// WithLemmatizer sets the lemmatizer option on the Trainer.
// It panics if the Trainer is handed to itself as its lemmatizer.
func WithLemmatizer(l lingo.Lemmatizer) TrainerConsOpt {
	f := func(t *Trainer) {
		// cannot pass in itself!
		if T, ok := l.(*Trainer); ok && T == t {
			panic("Recursive definition of lemmatizer (trying to set the t.lemmatizer = T) !")
		}
		t.l = l
	}
	return f
}

// WithStemmer sets up the stemmer option on the Trainer.
// It panics if the Trainer is handed to itself as its stemmer.
func WithStemmer(s lingo.Stemmer) TrainerConsOpt {
	f := func(t *Trainer) {
		// cannot pass in itself
		if T, ok := s.(*Trainer); ok && T == t {
			panic("Recursive setting of stemmer! (Trying to set t.stemmer = T)")
		}
		t.s = s
	}
	return f
}

// WithCluster sets the brown cluster options for the Trainer
func WithCluster(c map[string]lingo.Cluster) TrainerConsOpt {
	f := func(t *Trainer) {
		t.c = c
	}
	return f
}

// WithCorpus creates a Trainer with a corpus. The corpus is also used as the neural network's dict.
func WithCorpus(c *corpus.Corpus) TrainerConsOpt {
	f := func(t *Trainer) {
		t.corpus = c
		t.nn.dict = c
	}
	return f
}

// WithGeneratedCorpus creates a Trainer's corpus from a list of SentenceTags. The corpus will be generated from the SentenceTags.
// If the Trainer already has a corpus, the generated one is merged into it instead of replacing it.
func WithGeneratedCorpus(sts ...treebank.SentenceTag) TrainerConsOpt {
	f := func(t *Trainer) {
		dict := corpus.GenerateCorpus(sts)
		if t.corpus == nil {
			t.corpus = dict
		} else {
			t.corpus.Merge(dict)
		}
		t.nn.dict = t.corpus
	}
	return f
}

// Trainer trains a model
type Trainer struct {
	trainingSet []treebank.SentenceTag
	crossValSet []treebank.SentenceTag

	once sync.Once // guards Init
	*Model

	// Training configuration
	EvalPerIter int    // for cross validation - evaluate results every n epochs
	PassDirect  bool   // Pass on the costs directly to the cost channel? If false, an average will be used
	SaveBest    string // SaveBest is the filename that will be saved. If it's empty then the best-while-training will not be saved

	// fixer
	l lingo.Lemmatizer
	s lingo.Stemmer
	c map[string]lingo.Cluster

	err  chan error
	cost chan float64     // created lazily by Cost
	perf chan Performance // created lazily by Perf
}

// NewTrainer creates a new Trainer.
func NewTrainer(opts ...TrainerConsOpt) *Trainer { t := new(Trainer) // set up the default model t.Model = new(Model) t.corpus = KnownWords t.ts = transitions // set up the neural network t.nn = new(neuralnetwork2) t.nn.NNConfig = DefaultNNConfig t.nn.transitions = transitions t.nn.dict = KnownWords for _, opt := range opts { opt(t) } return t } // Lemmatize implemnets lingo.Lemmatizer func (t *Trainer) Lemmatize(a string, pt lingo.POSTag) ([]string, error) { if t.l == nil { return nil, componentUnavailable("Lemmatizer") } return t.l.Lemmatize(a, pt) } // Stem implements lingo.Stemmer func (t *Trainer) Stem(a string) (string, error) { if t.s == nil { return "", componentUnavailable("Stemmer") } return t.s.Stem(a) } // Clusters implements lingo.Fixer func (t *Trainer) Clusters() (map[string]lingo.Cluster, error) { if t.c == nil { return nil, componentUnavailable("Clusters") } return t.c, nil } /* Getters */ // Cost returns a channel of costs for monitoring the training. If the PassDirect field in the trainer is set to true // then the costs are directly returned. Otherwise the costs are averaged over the epoch. func (t *Trainer) Cost() <-chan float64 { if t.cost == nil { t.cost = make(chan float64) } return t.cost } // Perf returns a channel of Performance for monitoring the training. func (t *Trainer) Perf() <-chan Performance { if t.perf == nil { t.perf = make(chan Performance) } return t.perf } /* Methods */ // Init initializes the DependencyParser with a corpus and a neural network config func (t *Trainer) Init() (err error) { f := func() { err = t.nn.init() } t.once.Do(f) return } // Train trains a model. 
// // If a cross validation set is provided, it will automatically train with the cross validation set func (t *Trainer) Train(epochs int) error { if err := t.pretrainCheck(); err != nil { return err } if len(t.crossValSet) > 0 { return t.crossValidateTrain(epochs) } return t.train(epochs) } // TrainWithoutCrossValidation trains a model without cross validation. func (t *Trainer) TrainWithoutCrossValidation(epochs int) error { return t.train(epochs) } // train simply trains the model without having a cross validation. func (t *Trainer) train(epochs int) error { var epochChan chan struct{} if t.cost != nil { defer func() { close(t.cost) t.cost = nil }() epochChan = t.handleCosts() if epochChan != nil { defer close(epochChan) } } examples := makeExamples(t.trainingSet, t.nn.NNConfig, t.nn.dict, t.ts, t) for e := 0; e < epochs; e++ { if err := t.nn.train(examples); err != nil { return err } if epochChan != nil { epochChan <- struct{}{} } shuffleExamples(examples) } return nil } // crossValidateTrain trains the model but also does cross validation to ensure overfitting don't happen. 
func (t *Trainer) crossValidateTrain(epochs int) error { if t.perf != nil { defer func() { close(t.perf) t.perf = nil }() } var epochChan chan struct{} if t.cost != nil { defer func() { close(t.cost) t.cost = nil }() epochChan = t.handleCosts() if epochChan != nil { defer close(epochChan) } } examples := makeExamples(t.trainingSet, t.nn.NNConfig, t.nn.dict, t.ts, t) var best Performance for e := 0; e < epochs; e++ { if err := t.nn.train(examples); err != nil { return err } if t.EvalPerIter > 0 && e%t.EvalPerIter == 0 || e == epochs-1 { perf := t.crossValidate(t.crossValSet) // if there is a channel to report back the performance, send it down if t.perf != nil { perf.Iter = e t.perf <- perf } if perf.UAS > best.UAS { best = perf if t.SaveBest != "" { f, err := os.Create(t.SaveBest) if err != nil { err = errors.Wrapf(err, "Unable to open SaveBest file %q", t.SaveBest) return err } t.Model.SaveWriter(f) } } } if epochChan != nil { epochChan <- struct{}{} } shuffleExamples(examples) } return nil } // pretrainCheck checks if everything is sane func (t *Trainer) pretrainCheck() error { // check if t.nn == nil || !t.nn.initialized() { return errors.Errorf("DependencyParser not init()'d. Perhaps you forgot to call .Init() somewhere?") } if len(t.trainingSet) == 0 { return errors.Errorf("Cannot train with no training data set") } return nil } // handleCosts handles the costs from the neural network in two ways: // 1. pass: directly passes on the costs (which may come from multiple batches in an epoch) // 2. mean: calculates the mean of the costs and passes it on into d.cost // // If d.cost is nil, it simply returns. 
This method should be called after a check that d.cost is not nil func (t *Trainer) handleCosts() (epochChan chan struct{}) { nncost := t.nn.costProgress() if t.PassDirect { go func() { for cost := range nncost { switch c := cost.Data().(type) { case float32: t.cost <- float64(c) case float64: t.cost <- c default: // this should NEVER happen panic(fmt.Sprintf("Unhandled cost type %T", c)) } } }() } else { epochChan = make(chan struct{}) // it collects the costs until the epoch chan signals that an epoch is done. Then the cost is averaged and sent down the d.cost channel go func(epochChan chan struct{}) { var collected []float64 for { select { case cost := <-nncost: switch c := cost.Data().(type) { case float32: collected = append(collected, float64(c)) case float64: collected = append(collected, c) default: // this should NEVER happen panic(fmt.Sprintf("Unhandled cost type %T", c)) } case <-epochChan: var avg float64 for _, cost := range collected { avg += cost } if len(collected) > 0 { avg /= float64(len(collected)) } t.cost <- avg collected = collected[:0] } } }(epochChan) } return } ================================================ FILE: dep/train_test.go ================================================ package dep import ( "testing" "github.com/chewxy/lingo/corpus" G "gorgonia.org/gorgonia" ) func TestTrainerInitializations(t *testing.T) { var d *Trainer c := corpus.New() d = NewTrainer(WithCorpus(c)) if d.corpus != c { t.Errorf("Expected Corpus to be set to %p. 
Got %p instead", c, d.corpus) } d = NewTrainer(WithConfig(DefaultNNConfig)) if d.corpus != KnownWords { t.Error("Expected corpus to be set to the default KnownWords corpus") } if d.nn == nil { t.Fatal("Expected a neural network") } if d.nn.dict != KnownWords { t.Error("Expected neuralnetwork's dict to be set") } // d2 = d.Clone() // if d2.nn != d.nn { // t.Error("Expected a neural network!") // } // // init empty // d = New() // if err := d.Init(); err != nil { // t.Errorf("%+v", err) // } // // init with a corpus // d = New(WithCorpus(c)) // if err := d.Init(); err != nil { // t.Errorf("%+v", err) // } } func TestTrainer_train(t *testing.T) { sts := allSentences() epochs := 10 var err error trainer := NewTrainer(WithGeneratedCorpus(sts...), WithTrainingSet(sts)) if err = trainer.Train(epochs); err == nil { t.Error("Expected an error when training an uninitialized Trainer") } // with init t.Logf("Pass On Costs Directly") conf := DefaultNNConfig conf.BatchSize = 90 trainer = NewTrainer(WithGeneratedCorpus(sts...), WithConfig(conf), WithTrainingSet(sts)) if err := trainer.Init(); err != nil { t.Errorf("%+v", err) } trainer.PassDirect = true var costs []float64 cost := trainer.Cost() go func() { for c := range cost { costs = append(costs, c) t.Logf("Cost %v", c) } }() if err = trainer.Train(epochs); err != nil { t.Errorf("Err: %v", err) } if len(costs) == 0 { t.Errorf("Zero costs...") goto avgcosts } t.Logf("Costs %d", len(costs)) if len(costs) < (epochs*2)-5 { // we'll allow some tolerance t.Errorf("Expected some costs") } if costs[0] < costs[len(costs)-1] { t.Errorf("Costs should be reducing") } avgcosts: // with init, avg costs t.Logf("Average Costs") costs = costs[:0] // reset conf = DefaultNNConfig conf.Dtype = G.Float32 trainer = NewTrainer(WithGeneratedCorpus(sts...), WithConfig(conf), WithTrainingSet(sts)) if err := trainer.Init(); err != nil { t.Errorf("%+v", err) } trainer.PassDirect = false cost = trainer.Cost() go func() { for c := range cost { costs = 
append(costs, c) t.Logf("Cost %v", c) } }() if err = trainer.Train(epochs); err != nil { t.Errorf("%v", err) } if len(costs) == 0 { t.Fatal("Zero costs") } t.Logf("Costs %d", len(costs)) if len(costs) == 0 { t.Errorf("Expected some costs") } if costs[0] < costs[len(costs)-1] { t.Errorf("Costs should be reducing") } } func TestTestTrainer_crossValidateTrain(t *testing.T) { sts := allSentences() cv := cvSentences() epochs := 10 var trainer *Trainer var err error // uninit t.Logf("Uninitiated") trainer = NewTrainer(WithGeneratedCorpus(sts...)) if err = trainer.Train(epochs); err == nil { t.Errorf("Expected an error when training with an uninitialized Trainer") } // with init t.Logf("Pass On Costs Directly") conf := DefaultNNConfig conf.BatchSize = 90 trainer = NewTrainer(WithGeneratedCorpus(sts...), WithConfig(conf), WithTrainingSet(sts), WithCrossValidationSet(cv)) trainer.PassDirect = true if err := trainer.Init(); err != nil { t.Errorf("%+v", err) } var costs []float64 cost := trainer.Cost() perf := trainer.Perf() go func() { for p := range perf { t.Logf("Perf \n%v", p) } }() go func() { for c := range cost { costs = append(costs, c) t.Logf("Cost %v", c) } }() if err = trainer.Train(epochs); err != nil { t.Error(err) } if len(costs) == 0 { t.Errorf("Zero costs") goto avgCosts } t.Logf("Costs %d", len(costs)) if len(costs) < (epochs*2)-5 { // we'll allow some tolerance t.Errorf("Expected some costs") } if costs[0] < costs[len(costs)-1] { t.Errorf("Costs should be reducing") } avgCosts: // with init, avg costs, and using float32 t.Logf("Average Costs") costs = costs[:0] // reset conf = DefaultNNConfig conf.Dtype = G.Float32 trainer = NewTrainer(WithGeneratedCorpus(sts...), WithConfig(conf), WithTrainingSet(sts), WithCrossValidationSet(cv)) if err := trainer.Init(); err != nil { t.Errorf("%+v", err) } trainer.PassDirect = false cost = trainer.Cost() perf = trainer.Perf() go func() { for p := range perf { t.Logf("Perf \n%v", p) } }() go func() { for c := range cost { 
costs = append(costs, c) t.Logf("Cost %v", c) } }() trainer.Train(epochs) if len(costs) == 0 { t.Fatal("Zero costs") } t.Logf("Costs %d", len(costs)) if len(costs) == 0 { t.Errorf("Expected some costs") } if costs[0] < costs[len(costs)-1] { t.Errorf("Costs should be reducing") } } ================================================ FILE: dep/transition.go ================================================ package dep import ( "fmt" "github.com/chewxy/lingo" ) // transition is a tuple of Move and label type transition struct { Move lingo.DependencyType } var transitions []transition var MAXTRANSITION int func buildTransitions(labels []lingo.DependencyType) []transition { ts := make([]transition, 0) // for _, l := range labels { // if l == lingo.NoDepType { // continue // } // t := transition{Left, l} // ts = append(ts, t) // } // for _, l := range labels { // if l == lingo.NoDepType { // continue // } // t := transition{Right, l} // ts = append(ts, t) // } // ts = append(ts, transition{Shift, lingo.NoDepType}) for _, m := range ALLMOVES { for _, l := range labels { if (m == Shift && l != lingo.NoDepType) || (m != Shift && l == lingo.NoDepType) { continue } t := transition{m, l} ts = append(ts, t) } } return ts } func (t transition) String() string { return fmt.Sprintf("(%s, %s)", t.Move, t.DependencyType) } func lookupTransition(t transition, table []transition) int { for i, v := range table { if v == t { return i } } panic(fmt.Sprintf("Transition %v not found", t)) } // this builds the default transitions func init() { lbls := make([]lingo.DependencyType, lingo.MAXDEPTYPE) for i := 0; i < int(lingo.MAXDEPTYPE); i++ { lbls[i] = lingo.DependencyType(i) } transitions = buildTransitions(lbls) MAXTRANSITION = len(transitions) } ================================================ FILE: dep/util.go ================================================ package dep func minInt(a, b int) int { if a < b { return a } return b } func maxInt(a, b int) int { if a > b { return a } return b } 
================================================ FILE: dependency.go ================================================ package lingo import ( "bytes" "fmt" ) // Dependency represents the dependency parse of a sentence. While AnnotatedSentence does // already do a job of representing the dependency parse of a sentence, *Dependency actually contains // meta information about the dependency parse (specifically, lefts, rights) that makes parsing a dependency a lot faster // // The fields are mostly left unexported for a good reason - a dependency parse SHOULD be static after it's been built type Dependency struct { AnnotatedSentence wordCount int lefts [][]int rights [][]int counter int // for checking if a tree is projective n int } type depConsOpt func(*Dependency) // FromAnnotatedSentence creates a dependency from an AnnotatedSentence. func FromAnnotatedSentence(s AnnotatedSentence) depConsOpt { fn := func(d *Dependency) { wc := len(s) d.AnnotatedSentence = s d.wordCount = wc d.n = wc - 1 } return fn } // AllocTree allocates the lefts and rights. Typical construction of the *Dependency doesn't allocate the trees as they're not necessary for a number of tasks. func AllocTree() depConsOpt { fn := func(d *Dependency) { d.lefts = make([][]int, d.wordCount) d.rights = make([][]int, d.wordCount) for i := 0; i < d.wordCount; i++ { d.lefts[i] = make([]int, 0) d.rights[i] = make([]int, 0) } } return fn } // NewDependency creates a new *Dependency. 
It takes optional construction options: // FromAnnotatedSentence // AllocTree func NewDependency(opts ...depConsOpt) *Dependency { d := new(Dependency) for _, opt := range opts { opt(d) } return d } func (d *Dependency) Sentence() AnnotatedSentence { return d.AnnotatedSentence } func (d *Dependency) Lefts() [][]int { return d.lefts } func (d *Dependency) Rights() [][]int { return d.rights } func (d *Dependency) WordCount() int { return d.wordCount } func (d *Dependency) N() int { return d.n } // please only use these for testing func (d *Dependency) SetLefts(l [][]int) { d.lefts = l } func (d *Dependency) SetRights(r [][]int) { d.rights = r } func (d *Dependency) Head(i int) int { if i < 0 || i >= d.wordCount || d.AnnotatedSentence[i].Head == nil { return -1 } return d.AnnotatedSentence[i].HeadID() } func (d *Dependency) Label(i int) DependencyType { if i < 0 || i >= d.wordCount { return NoDepType } return d.AnnotatedSentence[i].DependencyType } func (d *Dependency) Annotation(i int) *Annotation { if i < 0 || i >= d.wordCount { return nullAnnotation } return d.AnnotatedSentence[i] } func (d *Dependency) AddArc(head, child int, label DependencyType) { d.AddChild(head, child) d.AddRel(child, label) } func (d *Dependency) AddChild(head, child int) { headAnn := d.AnnotatedSentence[head] d.AnnotatedSentence[child].SetHead(headAnn) if child < head { d.lefts[head] = append(d.lefts[head], child) } else { d.rights[head] = append(d.rights[head], child) } d.n++ } func (d *Dependency) AddRel(child int, rel DependencyType) { // d.labels[child] = rel d.AnnotatedSentence[child].DependencyType = rel } func (d *Dependency) HasSingleRoot() bool { roots := 0 for _, a := range d.AnnotatedSentence { h := a.HeadID() if h == 0 { roots++ } } return roots == 1 } func (d *Dependency) IsLegal() bool { var heads []int for _, a := range d.AnnotatedSentence { h := a.HeadID() if h < 0 || h > d.wordCount { return false } heads = append(heads, -1) } for i := 1; i < d.wordCount; i++ { for k := i; k 
> 0; { if heads[k] >= 0 && heads[k] < 1 { break } if heads[k] == i { return false } heads[k] = i k = d.AnnotatedSentence[k].HeadID() } } return true } func (d *Dependency) IsProjective() bool { d.counter = -1 return d.projectiveVisit(0) } func (d *Dependency) projectiveVisit(w int) bool { for i := 1; i < w; i++ { if d.AnnotatedSentence[i].HeadID() == w && d.projectiveVisit(i) == false { return false } } d.counter++ if w != d.counter { return false } for i := w + 1; i < d.wordCount; i++ { if d.AnnotatedSentence[i].HeadID() == w && d.projectiveVisit(i) == false { return false } } return true } func (d *Dependency) Root() int { for i := 1; i <= d.n; i++ { if d.Head(i) == 0 { return i } } return 0 } func (d *Dependency) SprintRel() string { var buf bytes.Buffer for _, e := range d.Edges() { fmt.Fprintf(&buf, "%v(%q-%d, %q-%d)\n", e.Rel, e.Gov.Value, e.Gov.ID, e.Dep.Value, e.Dep.ID) } return buf.String() } type DependencyEdge struct { Gov *Annotation Dep *Annotation Rel DependencyType } // Sort interface type edgeByID []DependencyEdge func (b edgeByID) Len() int { return len(b) } func (b edgeByID) Swap(i, j int) { b[i], b[j] = b[j], b[i] } func (b edgeByID) Less(i, j int) bool { return b[i].Dep.ID < b[j].Dep.ID } ================================================ FILE: dependencyTree.go ================================================ package lingo import ( "github.com/awalterschulze/gographviz" "fmt" "sync" ) // A DependencyTree is an alternate form of representing a dependency parse. 
// This form makes it easier to traverse the tree type DependencyTree struct { Parent *DependencyTree ID int // the word number in a sentence Type DependencyType // refers to the dependency type to the parent Word *Annotation Children []*DependencyTree } func NewDependencyTree(parent *DependencyTree, ID int, ann *Annotation) *DependencyTree { return &DependencyTree{ Parent: parent, ID: ID, Word: ann, Children: make([]*DependencyTree, 0), } } func (d *DependencyTree) AddChild(child *DependencyTree) { d.Children = append(d.Children, child) } func (d *DependencyTree) AddRel(rel DependencyType) { d.Type = rel } func (d *DependencyTree) walk(c chan *DependencyTree, wg *sync.WaitGroup) { defer wg.Done() for _, child := range d.Children { wg.Add(1) go child.walk(c, wg) } c <- d // man someone should do somehting about my bad naming } func (d *DependencyTree) Dot() string { // walk graph c := make(chan *DependencyTree) out := make(chan string) go dotString(c, out) var wg sync.WaitGroup wg.Add(1) go d.walk(c, &wg) wg.Wait() close(c) return <-out } func dotString(c chan *DependencyTree, out chan string) { g := gographviz.NewEscape() g.SetName("G") g.SetDir(true) // it's always going to be a directed graph // g.AddNode("G", "Node_0x0", nil) // add the root for t := range c { id := fmt.Sprintf("Node_%p", t) attrs := map[string]string{ "label": fmt.Sprintf("%d: \"%s/%s\"", t.ID, t.Word.Value, t.Word.POSTag), } g.AddNode("G", id, attrs) if t.Parent == nil { continue } parentID := fmt.Sprintf("Node_%p", t.Parent) edgeAttrs := map[string]string{ "label": fmt.Sprintf("%v", t.Type), } g.AddEdge(parentID, id, true, edgeAttrs) } out <- g.String() } func (d *DependencyTree) Walk(fn func(interface{})) { for _, child := range d.Children { child.Walk(fn) } if fn != nil { fn(d) } } ================================================ FILE: dependencyType.go ================================================ package lingo import ( "fmt" "strings" ) // DependencyType represents the relation 
between two words type DependencyType byte var dependencyTypeLookup map[string]DependencyType func init() { dependencyTypeLookup = make(map[string]DependencyType) for dt := NoDepType; dt < MAXDEPTYPE; dt++ { s := dt.String() dependencyTypeLookup[s] = DependencyType(dt) dependencyTypeLookup[strings.ToLower(s)] = DependencyType(dt) } } func (dt DependencyType) MarshalText() ([]byte, error) { return []byte(fmt.Sprintf("%v", dt)), nil } func (dt *DependencyType) UnmarshalText(text []byte) error { str := strings.Trim(string(text), `"`) // for JSON use, if any deptype, _ := dependencyTypeLookup[str] *dt = deptype return nil } // list of dependency type functions func InDepTypes(x DependencyType, set []DependencyType) bool { for _, v := range set { if v == x { return true } } return false } func IsModifier(x DependencyType) bool { return InDepTypes(x, Modifiers) } func IsCompound(x DependencyType) bool { return InDepTypes(x, Compounds) } func IsDeterminerRel(x DependencyType) bool { return InDepTypes(x, DeterminerRels) } func IsMultiword(x DependencyType) bool { return InDepTypes(x, MultiWord) } func IsQuantifier(x DependencyType) bool { return InDepTypes(x, QuantifingMods) } ================================================ FILE: dependencyType_stanford.go ================================================ // +build stanfordrel package lingo const BUILD_RELSET = "stanfordrel" //go:generate stringer -type=DependencyType -output=dependencyType_stanford_string.go // http://nlp.stanford.edu/software/dependencies_manual.pdf const ( NoDepType DependencyType = iota Dep Root Aux // Auxilliary AuxPass // passive auxiliary Cop // Copula Arg // argument Agent // agent Comp // Complement AComp // adjectival complement CComp // clausal complement with internal subject XComp // clausal complement with external subject Obj // Object DObj // Direct Object IObj // Indirect Object PObj // Object of preposition Subj // subject NSubj // Nominal Subject NSubjPass // passive nominal subject 
CSubj // clausal subject CSubjPass // passive clausal subject Coordination // coordination (cannot use CC, as it's a POSTag) Conj // conjunction Expl // Expletive Mod // modifier AMod // adjectival modifier Appos // Appositional modifier Advcl // adverbial clause modifier Det // determiner Predet // predeterminer Preconj // Preconjunction Vmod // reduced, nonfinite verbal modifier MWE // multiword expression modifier Mark // marker (word introducing an Advcl or CComp) AdvMod // adverbial modifier Neg // negation modifier RCMod // relative clause modifier QuantMod // quantifier modifier NounMod // Noun Compound Modifier (cannot use NN because NN is defined as a POSTag) NPAdvMod // Noun phrase adverbial modifier TMod // temporal modifier Num // Numeric Modifier NumberElement // element of compound number (cannot use Number because Number is defined as a LexemeType) Prep // prepositional modifier Poss // possession modifier Possessive // possessive modifier ('s) PRT // phrasal verb partical Parataxis // Parataxis (words that are next to each other) GoesWith // GoesWith Punct // punctuation Ref // referant SDep // Semantic Dependent XSubj // controlling subject // additional stuff not found in the original, but found in EWT Case Compound NMod Discourse NumMod RelCl NFinCl NMod_Poss NMod_NPMod Vocative List MWPrep // multiword prepositional modifier Remnant Acl NPMod MDVod DetMod // found in stanford nnparser SD models PComp MAXDEPTYPE ) var Modifiers = []DependencyType{AMod} var Compounds = []DependencyType{Compound} var DeterminerRels = []DependencyType{Det, DetMod} var MultiWord = []DependencyType{MWE, MWPrep, Compound, Parataxis} var QuantifingMods = []DependencyType{QuantMod, NumMod} ================================================ FILE: dependencyType_stanford_string.go ================================================ // +build stanfordrel // Code generated by "stringer -type=DependencyType -output=dependencyType_stanford_string.go"; DO NOT EDIT package lingo 
import "fmt" const _DependencyType_name = "NoDepTypeDepRootNSubjNSubjPassDObjIObjCSubjCSubjPassCCompXCompNumModApposNModAClACl_RelClDetDet_PreDetAModNegCaseNMod_NPModNMod_TModNMod_PossAdvClAdvModCompoundCompound_PartMWEListParataxisDiscourseExplAuxAuxPassCopMarkPunctConjCoordinationCC_PreConjMAXDEPTYPE" var _DependencyType_index = [...]uint16{0, 9, 12, 16, 21, 30, 34, 38, 43, 52, 57, 62, 68, 73, 77, 80, 89, 92, 102, 106, 109, 113, 123, 132, 141, 146, 152, 160, 173, 176, 180, 189, 198, 202, 205, 212, 215, 219, 224, 228, 240, 250, 260} func (i DependencyType) String() string { if i >= DependencyType(len(_DependencyType_index)-1) { return fmt.Sprintf("DependencyType(%d)", i) } return _DependencyType_name[_DependencyType_index[i]:_DependencyType_index[i+1]] } ================================================ FILE: dependencyType_universal.go ================================================ // +build !stanfordrel package lingo const BUILD_RELSET = "universalrel" //go:generate stringer -type=DependencyType -output=dependencyType_universal_string.go // http://universaldependencies.github.io/docs/en/dep/all.html const ( NoDepType DependencyType = iota Dep Root // Core dependents of clausal predicates // nominal dependencies NSubj NSubjPass DObj IObj // predicate dependencies CSubj CSubjPass CComp XComp // Noun dependents // nominal dependencies NumMod Appos NMod // predicate dependencies ACl ACl_RelCl // RCMod in stanford deps Det Det_PreDet // modifier word AMod Neg // Case Marking, preposition, possessive Case //Non-Core Dependents of Clausal Predicates // Nominal dependencies NMod_NPMod NMod_TMod NMod_Poss // Predicate Dependencies AdvCl // Modifier Word AdvMod // Compounding and Unanalyzed Compound Compound_Part Name // Unused in English MWE Foreign // Unused in English GoesWith // Unused in English // Loose Joining Relations List Dislocated // Unused in English Parataxis Remnant // Unused in English Reparandum // Unused in English // Special Clausal Dependents // 
Nominal Dependent Vocative // Unused in English Discourse Expl // Auxilliary Aux AuxPass Cop // Other Mark Punct // Coordination Conj Coordination // CC CC_PreConj MAXDEPTYPE ) var Modifiers = []DependencyType{AMod} var Compounds = []DependencyType{Compound, Compound_Part} var DeterminerRels = []DependencyType{Det, Det_PreDet} var MultiWord = []DependencyType{MWE, Compound, Compound_Part, Parataxis} var QuantifingMods = []DependencyType{NumMod} ================================================ FILE: dependencyType_universal_string.go ================================================ // +build !stanfordrel // Code generated by "stringer -type=DependencyType -output=dependencyType_universal_string.go"; DO NOT EDIT package lingo import "fmt" const _DependencyType_name = "NoDepTypeDepRootNSubjNSubjPassDObjIObjCSubjCSubjPassCCompXCompNumModApposNModAClACl_RelClDetDet_PreDetAModNegCaseNMod_NPModNMod_TModNMod_PossAdvClAdvModCompoundCompound_PartNameMWEForeignGoesWithListDislocatedParataxisRemnantReparandumVocativeDiscourseExplAuxAuxPassCopMarkPunctConjCoordinationCC_PreConjMAXDEPTYPE" var _DependencyType_index = [...]uint16{0, 9, 12, 16, 21, 30, 34, 38, 43, 52, 57, 62, 68, 73, 77, 80, 89, 92, 102, 106, 109, 113, 123, 132, 141, 146, 152, 160, 173, 177, 180, 187, 195, 199, 209, 218, 225, 235, 243, 252, 256, 259, 266, 269, 273, 278, 282, 294, 304, 314} func (i DependencyType) String() string { if i >= DependencyType(len(_DependencyType_index)-1) { return fmt.Sprintf("DependencyType(%d)", i) } return _DependencyType_name[_DependencyType_index[i]:_DependencyType_index[i+1]] } ================================================ FILE: errors.go ================================================ package lingo type componentUnavailable interface { error Component() string } ================================================ FILE: go.mod ================================================ module github.com/chewxy/lingo require ( github.com/abiosoft/ishell v2.0.0+incompatible 
github.com/abiosoft/readline v0.0.0-20180607040430-155bce2042db // indirect github.com/awalterschulze/gographviz v0.0.0-20190221210632-1e9ccb565bca github.com/chewxy/hm v1.0.0 // indirect github.com/chewxy/math32 v1.0.0 // indirect github.com/chzyer/logex v1.1.10 // indirect github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/fatih/color v1.7.0 // indirect github.com/flynn-archive/go-shlex v0.0.0-20150515145356-3f9db97f8568 // indirect github.com/gogo/protobuf v1.2.1 // indirect github.com/golang/protobuf v1.2.0 // indirect github.com/google/flatbuffers v1.10.0 // indirect github.com/kljensen/snowball v0.6.0 github.com/leesper/go_rng v0.0.0-20171009123644-5344a9259b21 // indirect github.com/mattn/go-colorable v0.1.1 // indirect github.com/mattn/go-isatty v0.0.6 // indirect github.com/pkg/browser v0.0.0-20180916011732-0a3d74bf9ce4 github.com/pkg/errors v0.8.1 github.com/stretchr/testify v1.3.0 github.com/xtgo/set v1.0.0 golang.org/x/exp v0.0.0-20190221220918-438050ddec5e // indirect golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4 // indirect golang.org/x/sys v0.0.0-20190225065934-cc5685c2db12 // indirect golang.org/x/text v0.3.0 gonum.org/v1/gonum v0.0.0-20190221132855-8ea67971a689 // indirect gonum.org/v1/netlib v0.0.0-20190221094214-0632e2ebbd2d // indirect gorgonia.org/cu v0.9.0-beta // indirect gorgonia.org/dawson v1.1.0 // indirect gorgonia.org/gorgonia v0.9.1 gorgonia.org/tensor v0.9.0-beta gorgonia.org/vecf32 v0.7.0 // indirect gorgonia.org/vecf64 v0.7.0 // indirect ) go 1.13 ================================================ FILE: go.sum ================================================ github.com/abiosoft/ishell v2.0.0+incompatible/go.mod h1:HQR9AqF2R3P4XXpMpI0NAzgHf/aS6+zVXRj14cVk9qg= github.com/abiosoft/readline v0.0.0-20180607040430-155bce2042db/go.mod h1:rB3B4rKii8V21ydCbIzH5hZiCQE7f5E9SzUb/ZZx530= github.com/awalterschulze/gographviz v0.0.0-20190221210632-1e9ccb565bca 
h1:xwIXr1FpA2XBoohlpvgb11No/zbsh5Clm/98PWPcHVA= github.com/awalterschulze/gographviz v0.0.0-20190221210632-1e9ccb565bca/go.mod h1:GEV5wmg4YquNw7v1kkyoX9etIk8yVmXj+AkDHuuETHs= github.com/chewxy/hm v1.0.0 h1:zy/TSv3LV2nD3dwUEQL2VhXeoXbb9QkpmdRAVUFiA6k= github.com/chewxy/hm v1.0.0/go.mod h1:qg9YI4q6Fkj/whwHR1D+bOGeF7SniIP40VweVepLjg0= github.com/chewxy/math32 v1.0.0 h1:RTt2SACA7BTzvbsAKVQJLZpV6zY2MZw4bW9L2HEKkHg= github.com/chewxy/math32 v1.0.0/go.mod h1:Miac6hA1ohdDUTagnvJy/q+aNnEk16qWUdb8ZVhvCN0= github.com/chzyer/logex v1.1.10 h1:Swpa1K6QvQznwJRcfTfQJmTE72DqScAa40E+fbHEXEE= github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1 h1:q763qf9huN11kDQavWsoZXJNW3xEE4JJyHa5Q25/sd8= github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= github.com/flynn-archive/go-shlex v0.0.0-20150515145356-3f9db97f8568/go.mod h1:rZfgFAXFS/z/lEd6LJmf9HVZ1LkgYiHx5pHhV5DR16M= github.com/gogo/protobuf v1.2.1 h1:/s5zKNz0uPFCZ5hddgPdo2TK2TVrUNMn0OOX8/aZMTE= github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4= github.com/golang/protobuf v1.2.0 h1:P3YflyNX/ehuJFLhxviNdFxQPkGK5cDcApsge1SqnvM= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/google/flatbuffers v1.10.0 h1:wHCM5N1xsJ3VwePcIpVqnmjAqRXlR44gv4hpGi+/LIw= github.com/google/flatbuffers v1.10.0/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q= github.com/kisielk/gotool 
v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/kljensen/snowball v0.6.0/go.mod h1:27N7E8fVU5H68RlUmnWwZCfxgt4POBJfENGMvNRhldw= github.com/leesper/go_rng v0.0.0-20171009123644-5344a9259b21 h1:O75p5GUdUfhJqNCMM1ntthjtJCOHVa1lzMSfh5Qsa0Y= github.com/leesper/go_rng v0.0.0-20171009123644-5344a9259b21/go.mod h1:N0SVk0uhy+E1PZ3C9ctsPRlvOPAFPkCNlcPBDkt0N3U= github.com/mattn/go-colorable v0.1.1/go.mod h1:FuOcm+DKB9mbwrcAfNl7/TZVBZ6rcnceauSikq3lYCQ= github.com/mattn/go-isatty v0.0.5/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= github.com/mattn/go-isatty v0.0.6/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= github.com/pkg/browser v0.0.0-20180916011732-0a3d74bf9ce4/go.mod h1:4OwLy04Bl9Ef3GJJCoec+30X3LQs/0/m4HFRt/2LUSA= github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/xtgo/set v1.0.0 h1:6BCNBRv3ORNDQ7fyoJXRv+tstJz3m1JVFQErfeZz2pY= github.com/xtgo/set v1.0.0/go.mod h1:d3NHzGzSa0NmB2NhFyECA+QdRp29oEn2xbT+TpeFoM8= golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190221220918-438050ddec5e h1:dVreTP5bOOWt5GFwwvgTE2iU0TkIqi2x3r0b8qGlp6k= golang.org/x/exp v0.0.0-20190221220918-438050ddec5e/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4 h1:YUO/7uOKsKeq9UokNS62b8FYywz3ker1l1vDZRCRefw= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod 
h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190225065934-cc5685c2db12 h1:Zw7eRv6INHGfu15LVRN1vrrwusJbnfJjAZn3D1VkQIE= golang.org/x/sys v0.0.0-20190225065934-cc5685c2db12/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190206041539-40960b6deb8e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= gonum.org/v1/gonum v0.0.0-20190221132855-8ea67971a689 h1:C+7Si2b5qgXShERPqwtDu36i1o1yf1VM93A3GZIe9Fk= gonum.org/v1/gonum v0.0.0-20190221132855-8ea67971a689/go.mod h1:jevfED4GnIEnJrWW55YmY9DMhajHcnkqVnEXmEtMyNI= gonum.org/v1/netlib v0.0.0-20190221094214-0632e2ebbd2d h1:m4zHh49Vwhwq5n7qC7NRl5SqRfTyT/6PP2ASGNGRB1E= gonum.org/v1/netlib v0.0.0-20190221094214-0632e2ebbd2d/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= gorgonia.org/cu v0.9.0-beta h1:s4WQ6fiAGoErwIiXWHRB6Y9ydkx1vTTPwhWzoEZVePc= gorgonia.org/cu v0.9.0-beta/go.mod h1:RPEPIfaxxqUmeRe7T1T8a0NER+KxBI2McoLEXhP1Vd8= gorgonia.org/dawson v1.1.0 h1:o7+eJ3SKi9sheH19lpOat//tDbg0Y+M9iY/lH79VHqY= gorgonia.org/dawson v1.1.0/go.mod h1:Px1mcziba8YUBIDsbzGwbKJ11uIblv/zkln4jNrZ9Ws= gorgonia.org/gorgonia v0.9.1 h1:6blWHSDHCplQHem+pvo9dZvtsQp7l3ZiVqXk26frn9M= gorgonia.org/gorgonia v0.9.1/go.mod h1:qucT7YHm/2OuSHWEw/6Je/LQ5htRJNQJ1+qpB58fY8c= gorgonia.org/tensor v0.9.0-beta h1:16QQufB1vbJxVbIOaB5TwkerdlBWtw+AAnZHUZ531ZE= gorgonia.org/tensor v0.9.0-beta/go.mod h1:05Y4laKuVlj4qFoZIZW1q/9n1jZkgDBOLmKXZdBLG1w= gorgonia.org/vecf32 v0.7.0 h1:mkpVzSyT7/Cput5/ZxaMzzp2xbmOtqOyJlTf7AdSMe0= gorgonia.org/vecf32 v0.7.0/go.mod h1:iHG+kvTMqGYA0SgahfO2k62WRnxmHsqAREGbayRDzy8= gorgonia.org/vecf64 v0.7.0 h1:ZphOGJfnWlFfY7x8WAJAfO64IAtYqPPq9TEGem+ItZE= gorgonia.org/vecf64 v0.7.0/go.mod 
h1:1y4pmcSd+wh3phG+InwWQjYrqwyrtN9h27WLFVQfV1Q= ================================================ FILE: interfaces.go ================================================ package lingo import ( "encoding/gob" "gorgonia.org/tensor" ) // Lemmatizer is anything that can lemmatize type Lemmatizer interface { Lemmatize(string, POSTag) ([]string, error) } // Stemmer is anything that can stem type Stemmer interface { Stem(string) (string, error) } // Sentencer is anything that returns an AnnotatedSentence type Sentencer interface { Sentence() AnnotatedSentence } // Corpus is the interface for the corpus. type Corpus interface { // ID returns the ID of a word and whether or not it was found in the corpus Id(word string) (id int, ok bool) // Word returns the word given the ID, and whether or not it was found in the corpus Word(id int) (word string, ok bool) // Add adds a word to the corpus and returns its ID. If a word was previously in the corpus, it merely updates the frequency count and returns the ID Add(word string) int // Size returns the size of the corpus. Size() int // WordFreq returns the frequency of the word. If the word wasn't in the corpus, it returns 0. WordFreq(word string) int // IDFreq returns the frequency of a word given an ID. If the word isn't in the corpus it returns 0. IDFreq(id int) int // TotalFreq returns the total number of words ever seen by the corpus. This number includes the count of repeat words. 
TotalFreq() int // MaxWordLength returns the length of the longest known word in the corpus MaxWordLength() int // WordProb returns the probability of a word appearing in the corpus WordProb(word string) (float64, bool) // IO stuff gob.GobEncoder gob.GobDecoder } // WordEmbeddings is any type that is both a corpus and can return word vectors type WordEmbeddings interface { Corpus // WordVector returns a vector of embeddings given the word WordVector(word string) (vec tensor.Tensor, err error) // Vector returns a vector of embeddings given the word ID Vector(id int) (vec tensor.Tensor, err error) // Embedding returns the matrix Embedding() tensor.Tensor } ================================================ FILE: io.go ================================================ package lingo import ( "bytes" "encoding/json" "fmt" "strings" "github.com/pkg/errors" ) type dummyAnnotation struct { POSTag `json:"POSTag"` DependencyType `json:"Label"` ID int `json:"ID"` Head int `json:"Head"` Value string `json:"Value"` Lemma string `json:"Lemma"` Stem string `json:"Stem"` Cluster `json:"Cluster"` Shape `json:"Shape"` WordFlag `json:"WordFlat"` } // func (a *Annotation) MarshalText() ([]byte, error) { // var buf bytes.Buffer // if a.Head != nil { // fmt.Fprintf(&buf, "%v(%q/%v-%d, %q/%v-%d)", a.DependencyType, a.Value, a.POSTag, a.ID, a.Head.Value, a.Head.POSTag, a.Head.ID) // } else if a == rootAnnotation { // fmt.Fprintf(&buf, "ROOT") // } else { // fmt.Fprintf(&buf, "%q/%v-%d", a.Value, a.POSTag, a.ID) // } // return buf.Bytes(), nil // } func (a *Annotation) MarshalJSON() ([]byte, error) { var buf bytes.Buffer buf.WriteRune('{') fmt.Fprintf(&buf, "\"ID\": %d,", a.ID) fmt.Fprintf(&buf, "\"Value\": %q,", a.Value) fmt.Fprintf(&buf, "\"POSTag\": \"%v\",", a.POSTag) fmt.Fprintf(&buf, "\"Label\": \"%v\"", a.DependencyType) if a.Head != nil { if a.Head == rootAnnotation { fmt.Fprintf(&buf, ", \"Head\": -1000") // special signifier for root annotations } else { fmt.Fprintf(&buf, ", 
\"Head\": %d", a.HeadID()) } } if a.Lemma != "" { fmt.Fprintf(&buf, ", \"Lemma\": %q", a.Lemma) } // Lowered is not serialized because it's a simple function call away if a.Stem != "" { fmt.Fprintf(&buf, ",\"Stem\": %q", a.Stem) } if a.Cluster > 0 { fmt.Fprintf(&buf, ",\"Cluster\": %d", a.Cluster) } if a.Shape != "" { fmt.Fprintf(&buf, ",\"Shape\": %q", a.Shape) } if a.WordFlag > 0 { fmt.Fprintf(&buf, ",\"WordFlag\": %d", a.WordFlag) } buf.WriteRune('}') return buf.Bytes(), nil } func (a *Annotation) UnmarshalJSON(b []byte) error { if a == nil { // error return errors.Errorf("Cannot unmarshal json to a nul") } d := dummyAnnotation{} if err := json.Unmarshal(b, &d); err != nil { return err } a.Value = d.Value a.POSTag = d.POSTag a.DependencyType = d.DependencyType a.ID = d.ID a.Lemma = d.Lemma a.Stem = d.Stem a.Cluster = d.Cluster a.Shape = d.Shape a.WordFlag = d.WordFlag return nil } func (as AnnotatedSentence) MarshalJSON() ([]byte, error) { buf := new(bytes.Buffer) encoder := json.NewEncoder(buf) buf.WriteRune('[') for i, a := range as { if err := encoder.Encode(a); err != nil { return nil, err } if i < len(as)-1 { buf.WriteRune(',') } } buf.WriteRune(']') return buf.Bytes(), nil } func (as *AnnotatedSentence) UnmarshalJSON(b []byte) error { dummies := make([]dummyAnnotation, 0) if err := json.Unmarshal(b, &dummies); err != nil { return err } asL := len(*as) l := len(dummies) if asL != l { diff := l - asL (*as) = append(*as, make(AnnotatedSentence, diff)...) 
} for i, d := range dummies { a := (*as)[i] if d.Value == "-ROOT-" { (*as)[i] = rootAnnotation continue } if a == nil { a = new(Annotation) } a.Value = d.Value a.POSTag = d.POSTag a.DependencyType = d.DependencyType a.ID = d.ID a.Lemma = d.Lemma a.Stem = d.Stem a.Cluster = d.Cluster a.Shape = d.Shape a.WordFlag = d.WordFlag (*as)[i] = a } // fix up head IDs for i, d := range dummies { a := (*as)[i] head := d.Head if head == -1000 { a.SetHead(rootAnnotation) } else { a.SetHead((*as)[head]) } } // TODO: fix up other things for _, a := range *as { a.Lowered = strings.ToLower(a.Value) } return nil } ================================================ FILE: io_test.go ================================================ package lingo import ( "encoding/json" "testing" ) func TestAnnotationJSON(t *testing.T) { a := NewAnnotation() a.Value = "hello" a.POSTag = NOUN a.DependencyType = Aux a.ID = 2 b, err := json.Marshal(a) if err != nil { t.Error(err) } t.Logf(" %s", string(b)) x := `{"ID":2,"Value":"hello","POSTag":"NOUN","Label":"Aux"}` c := NewAnnotation() if err = json.Unmarshal([]byte(x), c); err != nil { t.Error(err) } if c.Value != a.Value { t.Errorf("Expected Value to be %q. Got %q insteed", a.Value, c.Value) } if c.POSTag != a.POSTag { t.Errorf("Expected POSTag to be %v. Got %v instead", a.POSTag, c.POSTag) } if c.DependencyType != a.DependencyType { t.Errorf("Expected DependencyType to be %v. 
Got %v instead", a.DependencyType, c.DependencyType) } } func TestAnnotatedSentenceJSON(t *testing.T) { a := NewAnnotation() a.Value = "hello" a.POSTag = NOUN a.DependencyType = Aux a.ID = 0 b := NewAnnotation() b.Value = "world" b.POSTag = NOUN b.DependencyType = Aux b.ID = 1 b.Head = rootAnnotation a.Head = b as := AnnotatedSentence{a, b} bs, err := json.Marshal(as) if err != nil { t.Fatal(err) } t.Logf("%s", string(bs)) x := `[{"ID":0,"Value":"hello","POSTag":"NOUN","Label":"Aux","Head":1},{"ID":1,"Value":"world","POSTag":"NOUN","Label":"Aux","Head":-1000}]` var cs AnnotatedSentence if err = json.Unmarshal([]byte(x), &cs); err != nil { t.Error(err) } t.Logf("%v", cs) for i, c := range cs { d := as[i] if c.Value != d.Value { t.Error("Expected Values to be the same") } if c.POSTag != d.POSTag { t.Error("POSTag not the same") } if c.DependencyType != d.DependencyType { t.Error("Dependency Types not the same") } if c.HeadID() != d.HeadID() { t.Errorf("%v HeadIDs not the same. Want %v, got %v instead", d, d.HeadID(), c.HeadID()) } } } ================================================ FILE: lexeme.go ================================================ package lingo import ( "fmt" "unicode" ) //go:generate stringer -type=LexemeType type LexemeType byte const ( EOF LexemeType = iota Word Disambig URI Number Date Time Punctuation Symbol Space SystemUse ) type Lexeme struct { Value string LexemeType LexemeType Line int Col int Pos int } func MakeLexeme(s string, t LexemeType) Lexeme { return Lexeme{ Value: s, LexemeType: t, Line: -1, Col: -1, Pos: -1, } } func (l Lexeme) Fix() Lexeme { if StringIs(l.Value, unicode.IsDigit) { l.LexemeType = Number return l } return l } func (l Lexeme) String() string { switch l.LexemeType { case EOF: return "EOF" default: return fmt.Sprintf("%q/%v", l.Value, l.LexemeType) } } func (l Lexeme) GoString() string { switch l.LexemeType { case EOF: return fmt.Sprintf("EOF: %q (%d, %d, %d)", l.Value, l.Line, l.Col, l.Pos) default: return 
fmt.Sprintf("%s: %q (%d, %d, %d)", l.LexemeType, l.Value, l.Line, l.Col, l.Pos) } } var startLexeme = MakeLexeme("START_LEXEME", SystemUse) var rootLexeme = MakeLexeme("-ROOT-", SystemUse) var nullLexeme = MakeLexeme("", SystemUse) func StartLexeme() Lexeme { return startLexeme } func RootLexeme() Lexeme { return rootLexeme } func NullLexeme() Lexeme { return nullLexeme } ================================================ FILE: lexemetype_string.go ================================================ // Code generated by "stringer -type=LexemeType"; DO NOT EDIT package lingo import "fmt" const _LexemeType_name = "EOFWordDisambigURINumberDateTimePunctuationSymbolSpaceSystemUse" var _LexemeType_index = [...]uint8{0, 3, 7, 15, 18, 24, 28, 32, 43, 49, 54, 63} func (i LexemeType) String() string { if i >= LexemeType(len(_LexemeType_index)-1) { return fmt.Sprintf("LexemeType(%d)", i) } return _LexemeType_name[_LexemeType_index[i]:_LexemeType_index[i+1]] } ================================================ FILE: lexer/lexer.go ================================================ package lexer import ( "bufio" "bytes" "io" "strings" "sync" "golang.org/x/text/unicode/norm" "github.com/chewxy/lingo" ) const eof rune = -1 type Lexer struct { name string input *bufio.Reader state stateFn r rune width int pos int start int line int col int // the string we're reading buf *bytes.Buffer Output chan lingo.Lexeme Errors chan error sync.Mutex } func New(name string, r io.Reader) *Lexer { return &Lexer{ name: name, input: bufio.NewReader(r), width: 1, start: 1, // for userfriendliness, the column index starts at 1 col: 1, pos: 1, buf: new(bytes.Buffer), Output: make(chan lingo.Lexeme), Errors: make(chan error), } } func (l *Lexer) Run() { l.Lock() defer l.Unlock() defer close(l.Output) for state := lexText; state != nil; { state = state(l) } } // Reset resets the buffers. 
It creates a new Output and Error channel func (l *Lexer) Reset(r io.Reader) { l.Lock() l.input.Reset(r) l.buf.Reset() l.Output = make(chan lingo.Lexeme) l.Errors = make(chan error) l.Unlock() } func (l *Lexer) next() rune { var err error l.r, l.width, err = l.input.ReadRune() if err == io.EOF { l.width = 1 return eof } l.col += l.width l.pos += l.width return l.r } // nextUntilEOF will loop until it finds the matching string OR EOF func (l *Lexer) nextUntilEOF(s string) bool { for r := l.next(); r != eof && strings.IndexRune(s, r) < 0; r = l.next() { // l.next() l.accept() } if l.r == eof { return true } return false } func (l *Lexer) backup() { l.input.UnreadRune() l.pos -= l.width l.col -= l.width } func (l *Lexer) peek() rune { backup := l.r pos := l.pos col := l.col r := l.next() l.backup() l.pos = pos l.col = col l.r = backup return r } func (l *Lexer) lineCount() { newLines := bytes.Count(l.buf.Bytes(), []byte("\n")) l.line += newLines if newLines > 0 { l.col = 1 } } func (l *Lexer) accept() { l.buf.WriteRune(l.r) } func (l *Lexer) acceptRun(valid string) (accepted bool) { for strings.IndexRune(valid, l.peek()) >= 0 { l.next() l.accept() accepted = true } return } func (l *Lexer) acceptRunFn(fn func(rune) bool) (accepted int) { for fn(l.peek()) { l.next() l.accept() accepted++ } return } func (l *Lexer) ignore() { l.start = l.pos l.buf.Reset() } func (l *Lexer) emit(t lingo.LexemeType) { normalized := string(norm.NFC.Bytes(l.buf.Bytes())) lex := lingo.MakeLexeme(normalized, t) lex.Line = l.line lex.Col = l.start lex.Pos = l.pos - l.buf.Len() // TODO: sometimes the offset is wrong on leading tokens since l.pos starts at 1 // if lex.Pos < 0 { // lex.Pos = 0 // } l.Output <- lex // reset l.ignore() if l.r != 0x0 { l.buf.WriteRune(l.r) } } ================================================ FILE: lexer/lexer_test.go ================================================ package lexer import ( "strings" "testing" "github.com/chewxy/lingo" ) type lexerTest struct { name 
string s string lexemes []lingo.Lexeme } var lexerTests = []lexerTest{ // {"empty", "", []lingo.Lexeme{ // {"", lingo.EOF, 0, 1, 0}, // }}, // // {"spaces", " \t", []lingo.Lexeme{ // {"", lingo.EOF, 0, 3, 2}, // }}, // // {"newlines", "\n\r\n\n", []lingo.Lexeme{ // {"", lingo.EOF, 3, 5, 4}, // }}, // // {"simple text", "hello world", []lingo.Lexeme{ // {"hello", lingo.Word, 0, 1, 0}, // {"world", lingo.Word, 0, 7, 6}, // {"", lingo.EOF, 0, 12, 11}, // }}, // // {"simple number", "3.1415", []lingo.Lexeme{ // {"3.1415", lingo.Number, 0, 1, 0}, // {"", lingo.EOF, 0, 12, 5}, // }}, {"advanced numerology", "3.14 -1.618", []lingo.Lexeme{ {"3.14", lingo.Number, 0, 1, 0}, {"-1.618", lingo.Number, 0, 6, 5}, {"", lingo.EOF, 0, 11, 10}, }}, // {"advanced numerology", "3.14 -1.618 6.023e23 1e-13", []lingo.Lexeme{ // {"3.14", lingo.Number, 0, 1, 0}, // {"-1.618", lingo.Number, 0, 6, 5}, // {"6.023e23", lingo.Number, 0, 13, 12}, // {"1e-13", lingo.Number, 0, 21, 20}, // {"", lingo.EOF, 0, 26, 25}, // }}, // // {"esoteric numerology", "1/2 1 1/4", []lingo.Lexeme{ // {"1/2", lingo.Number, 0, 1, 0}, // {"1", lingo.Number, 0, 5, 4}, // {"1/4", lingo.Number, 0, 7, 6}, // {"", lingo.EOF, 0, 10, 9}, // }}, // // {"text with numbers", "one plus 1 don't equals 3", []lingo.Lexeme{ // {"one", lingo.Word, 0, 1, 0}, // {"plus", lingo.Word, 0, 5, 4}, // {"1", lingo.Number, 0, 10, 9}, // {"do", lingo.Word, 0, 12, 11}, // {"n't", lingo.Word, 0, 14, 13}, // {"equals", lingo.Word, 0, 18, 17}, // {"3", lingo.Number, 0, 24, 23}, // {"", lingo.EOF, 0, 25, 24}, // }}, // // {"text with numbers + punct", "First111!.!", []lingo.Lexeme{ // {"First111", lingo.Word, 0, 1, 0}, // {"!.!", lingo.Punctuation, 0, 9, 8}, // {"", lingo.EOF, 0, 10, 9}, // }}, // // {"text with verb contractions", "You're panic'd I'll get'em I've", []lingo.Lexeme{ // {"You", lingo.Word, 0, 1, 0}, // {"'re", lingo.Word, 0, 3, 2}, // {"panic", lingo.Word, 0, 8, 7}, // {"'d", lingo.Word, 0, 13, 12}, // {"I", lingo.Word, 0, 16, 15}, 
// {"'ll", lingo.Word, 0, 17, 16}, // {"get", lingo.Word, 0, 21, 20}, // {"'em", lingo.Word, 0, 24, 23}, // {"I", lingo.Word, 0, 27, 26}, // {"'ve", lingo.Word, 0, 30, 29}, // {"", lingo.EOF, 0, 33, 32}, // }}, // // {"email", "dont@email.this", []lingo.Lexeme{ // {"dont@email.this", lingo.Word, 0, 1}, // {"", lingo.EOF, 0, 10}, // }}, // // {"plain dashes should not be numbers", "this case - like so", []lingo.Lexeme{ // {"this", lingo.Word, 0, 1}, // {"case", lingo.Word, 0, 5}, // {"-", lingo.Punctuation, 0, 6}, // {"like", lingo.Word, 0, 8}, // {"so", lingo.Word, 0, 13}, // {"", lingo.EOF, 0, 14}, // }}, // // {"parens should be printed", "like (this)", []lingo.Lexeme{ // {"like", lingo.Word, 0, 1}, // {"(", lingo.Punctuation, 0, 5}, // {"this", lingo.Word, 0, 6}, // {")", lingo.Punctuation, 0, 10}, // {"", lingo.EOF, 0, 11}, // }}, // // {"parenthesis should be considered separate", "USA(United States of America)", []lingo.Lexeme{ // {"USA", lingo.Word, 0, 1}, // {"(", lingo.Punctuation, 0, 1}, // {"United", lingo.Word, 0, 1}, // {"States", lingo.Word, 0, 1}, // {"of", lingo.Word, 0, 1}, // {"America", lingo.Word, 0, 1}, // {")", lingo.Punctuation, 0, 1}, // {"", lingo.EOF, 0, 0}, // }}, // // {"midstream puncuation", "like:this", []lingo.Lexeme{ // {"like", lingo.Word, 0, 1}, // {":", lingo.Punctuation, 0, 5}, // {"this", lingo.Word, 0, 6}, // {"", lingo.EOF, 0, 7}, // }}, // // {"midstream symbols", "e-meet ke$ha by e-mail $ell anti-inflammatory", []lingo.Lexeme{ // {"e-meet", lingo.Word, 0, 1}, // {"ke$ha", lingo.Word, 0, 1}, // {"by", lingo.Word, 0, 1}, // {"e-mail", lingo.Word, 0, 1}, // {"$", lingo.Symbol, 0, 1}, // {"ell", lingo.Word, 0, 1}, // {"anti-inflammatory", lingo.Word, 0, 1}, // {"", lingo.EOF, 0, 0}, // }}, // // {"abbrev", "USB, made in U.S.A. 
e.g t/away c/o", []lingo.Lexeme{ // {"USB", lingo.Word, 0, 1}, // {",", lingo.Punctuation, 0, 4}, // {"made", lingo.Word, 0, 6}, // {"in", lingo.Word, 0, 11}, // {"U.S.A", lingo.Word, 0, 14}, // {".", lingo.Punctuation, 0, 19}, // {"e.g", lingo.Word, 0, 0}, // {"t/away", lingo.Word, 0, 0}, // {"c/o", lingo.Word, 0, 0}, // {"", lingo.EOF, 0, 20}, // }}, // // {"date time", "1970/1/1 00:00 00:00:00", []lingo.Lexeme{ // {"1970/1/1", lingo.Date, 0, 1}, // {"00:00", lingo.Time, 0, 1}, // {"00:00:00", lingo.Time, 0, 20}, // {"", lingo.EOF, 0, 20}, // }}, // // {"date time with dashes", "31-12-1970", []lingo.Lexeme{ // {"31/12/1970", lingo.Date, 0, 1}, // {"", lingo.EOF, 0, 11}, // }}, // // {"URI", "wobsite: http://www.wobsite.something.this/is/still/a.url", []lingo.Lexeme{ // {"wobsite", lingo.Word, 0, 1}, // {":", lingo.Punctuation, 0, 8}, // {"http://www.wobsite.something.this/is/still/a.url", lingo.URI, 0, 10}, // {"", lingo.EOF, 0, 20}, // }}, // // {"proper sentence", "hello world.", []lingo.Lexeme{ // {"hello", lingo.Word, 0, 1}, // {"world", lingo.Word, 0, 6}, // {".", lingo.Punctuation, 0, 7}, // {"", lingo.EOF, 0, 8}, // }}, // // // Naive and Cafe uses combination diacritics, while the rest are just unicode // // The lexer should normalize all the things // {"pathological english words", "Façade à la Naïve Château Café", []lingo.Lexeme{ // {"Façade", lingo.Word, 0, 1}, // {"à", lingo.Word, 0, 1}, // {"la", lingo.Word, 0, 1}, // {"Naïve", lingo.Word, 0, 1}, // {"Château", lingo.Word, 0, 1}, // {"Café", lingo.Word, 0, 1}, // {"", lingo.EOF, 0, 0}, // }}, // // // just plain fucked // {"jpf", "你好 العالم", []lingo.Lexeme{ // {"你好", lingo.Word, 0, 1}, // {"العالم", lingo.Word, 0, 1}, // {"", lingo.EOF, 0, 0}, // }}, } func testLexer(lts *lexerTest) []lingo.Lexeme { l := New(lts.name, strings.NewReader(lts.s)) var retVal []lingo.Lexeme go l.Run() for lex := range l.Output { retVal = append(retVal, lex) } return retVal } func TestLexer(t *testing.T) { for _, lts := 
range lexerTests { lexemes := testLexer(<s) if len(lts.lexemes) != len(lexemes) { t.Errorf("Test %q: Expected %d lexemes. Got %d instead: %v", lts.name, len(lts.lexemes), len(lexemes), lexemes) continue } for i, lex := range lexemes { if lex.LexemeType != lts.lexemes[i].LexemeType || lex.Value != lts.lexemes[i].Value || lts.lexemes[i].Pos != lex.Pos { t.Errorf("Test %q, lexeme %d: Expected %#v. Got %#v instead", lts.name, i, lts.lexemes[i], lex) } } } } ================================================ FILE: lexer/stateFn.go ================================================ package lexer import ( "unicode" "github.com/chewxy/lingo" ) type stateFn func(*Lexer) stateFn func lexText(l *Lexer) (fn stateFn) { for { next := l.next() if next == eof { break } if l.pos != l.start { switch { case unicode.IsSpace(next): l.backup() fn = lexWhitespace case unicode.IsDigit(next): // if the position is start +1. // This means that the first char of the string to be lexed is a number // this prevents things like "yay1111" to be lexed as "yay" and "1111" if l.pos == l.start+1 { l.backup() return lexNumber } case next == ':': // possible URI if l.peek() == '/' { l.accept() // accept ':' l.next() if l.peek() == '/' { l.accept() return lexURI } // otherwise... l.backup() // "unaccept". since '/' has a width of 1 we can do the following l.buf.Truncate(l.buf.Len() - 1) } fn = lexPunctuation case unicode.IsPunct(next): // For things like "u.s" or "i.e." or "e.g." n := l.peek() switch { case next == '\'': if unicode.IsLetter(n) { l.emit(lingo.Word) return lexText } case n == eof: // common scenario - where a punctuation ends the sentence, and this thing is unable to backup l.width = 1 l.backup() l.width = 0 fn = lexPunctuation goto finishup // goto because there are other cases below case unicode.IsLetter(n) && (next == '.' 
// lexNumber lexes numbers. It first accepts a run of unicode digits and then
// inspects the rune that follows:
//
//   - '.'        : the dot and a further run of digits are accepted (decimal value)
//   - '-' or '/' : the separator is stored as '/' and lexing continues in lexDate
//   - ':'        : lexing continues in lexTime, or the digits are emitted as a
//     Number and lexPunctuation takes over
//   - otherwise  : the rune is un-read
//
// Afterwards it checks for 'e' or 'E', for exponentiation - 1.2E2.
// The accepted text is emitted as a Number, unless the buffer holds nothing
// but a single '-', in which case it is emitted as Punctuation. In both of
// those cases the next state is lexWhitespace.
func lexNumber(l *Lexer) (fn stateFn) {
	l.acceptRunFn(unicode.IsDigit)
	next := l.next()
	switch next {
	case '.':
		l.accept() // accept the dot
		l.acceptRunFn(unicode.IsDigit)
	case '-', '/':
		// standardize: whichever separator was read, '/' is what gets buffered
		l.r = '/'
		l.accept()
		return lexDate
	case ':':
		// NOTE(review): a distance of 3 between pos and start presumably means
		// two digits were read before the colon (as in "HH:") - confirm.
		if l.pos-l.start == 3 {
			l.accept()
			return lexTime
		} else {
			// un-read the colon so that lexPunctuation sees it
			l.backup()
			l.emit(lingo.Number)
			return lexPunctuation
		}
	default:
		l.backup()
	}

	if l.acceptRun("eE") {
		// handle negative exponents
		if l.peek() == '-' {
			l.next()
			l.accept()
			// run lexNumber again on the rest and return whatever state it yields
			return lexNumber(l)
		}
		l.acceptRunFn(unicode.IsDigit)
	}

	// NOTE(review): the accept helpers above finish with a peek, which already
	// un-reads the rune it looked at, so this appears to be a second
	// consecutive backup - confirm that it is intended.
	l.backup()
	if l.buf.Len() == 1 && l.buf.Bytes()[0] == '-' {
		l.emit(lingo.Punctuation) // dash
		return lexWhitespace
	}

	l.emit(lingo.Number)
	return lexWhitespace
}
unicode.IsDigit(next): return lexNumber case unicode.IsPunct(next): if next == '-' { l.next() l.accept() return lexNumber } return lexPunctuation case unicode.IsSymbol(next): return lexSymbol } return lexText } func lexPunctuation(l *Lexer) (fn stateFn) { next := l.next() switch next { case '\'': l.accept() n := l.peek() switch n { case 't', 's', 'm', 'd': l.next() l.accept() // accept 't'/'s'... l.emit(lingo.Word) return lexWhitespace } case '.': l.accept() // for cases such as "U.S" or "i.e" n := l.peek() if unicode.IsLetter(n) { l.accept() // accept . l.next() l.accept() return lexText } default: } accepted := l.acceptRunFn(unicode.IsPunct) // check for any other runs of punctuations punct := unicode.IsPunct(next) if accepted == 0 && punct { l.accept() } l.emit(lingo.Punctuation) if accepted == 0 && !punct && !unicode.IsSpace(next) { return lexText } return lexWhitespace } func lexSymbol(l *Lexer) (fn stateFn) { l.acceptRunFn(unicode.IsSymbol) l.acceptRunFn(unicode.IsPunct) // any symbol punctuation combination should be treated as a symbole l.emit(lingo.Symbol) return lexWhitespace } func lexURI(l *Lexer) (fn stateFn) { eof := l.nextUntilEOF(" ") if !eof { l.backup() l.backup() next := l.next() if unicode.IsPunct(next) { l.backup() l.emit(lingo.URI) return lexPunctuation } } l.emit(lingo.URI) return lexWhitespace } func lexDate(l *Lexer) (fn stateFn) { l.acceptRunFn(unicode.IsDigit) next := l.next() if next != '/' && next != '-' { l.backup() l.emit(lingo.Number) // fractions are numbers return lexWhitespace } l.r = '/' // standardize l.accept() l.acceptRunFn(unicode.IsDigit) l.emit(lingo.Date) return lexWhitespace } func lexTime(l *Lexer) (fn stateFn) { l.acceptRunFn(unicode.IsDigit) next := l.next() if next != ':' { l.backup() l.emit(lingo.Time) return lexWhitespace } l.accept() l.acceptRunFn(unicode.IsDigit) l.emit(lingo.Time) return lexWhitespace } ================================================ FILE: lingo.go 
================================================ // package lingo provides the data structures and algorithms required for natural language processing. package lingo ================================================ FILE: pos/allinone_test.go ================================================ package pos import ( "log" "strings" "testing" "github.com/chewxy/lingo" "github.com/chewxy/lingo/lexer" "github.com/chewxy/lingo/treebank" ) func TestEverything(t *testing.T) { sentences := treebank.ReadConllu(strings.NewReader(conllu)) sentence := "President Bush comes on federal courts." p := New(WithCluster(clusters), WithLemmatizer(dummyLem{}), WithStemmer(dummyStemmer{})) p.Train(sentences, 200) l := lexer.New(sentence, strings.NewReader(sentence)) p2 := p.Clone() p2.Input = l.Output var correct string if lingo.BUILD_TAGSET == "stanfordtags" { correct = "-ROOT-/ROOT_TAG President/NNP Bush/NNP comes/DT on/IN federal/JJ courts/NN ./FULLSTOP" } else { correct = "-ROOT-/ROOT_TAG President/PROPN Bush/PROPN comes/VERB on/ADP federal/ADJ courts/NOUN ./PUNCT" } go l.Run() go p2.Run() for a := range p2.Output { // this clearly isn't gonna be accurate, given the stubbed out Lemmatizer if a.String() != correct { t.Error("Something went wrong with the POSTagging") log.Printf("%v", a) } } } ================================================ FILE: pos/context.go ================================================ package pos import ( "strconv" "github.com/chewxy/lingo" ) /* A context is which word in the current state the POSTagger is in. 
There are so far 5 contexts: - Previous previous word - previous word - current word - next word - next next word For each context we have 8 features: - word (lower case) - lemma - cluster - shape - prefix (first 1) - suffix (last 3) - POSTag - wordflag */ //go:generate stringer -type=contextType type contextType byte const featuresPerContext = 8 const contexts = 5 const ( // previous previous (prev2) prev2Word contextType = iota prev2Lemma prev2Cluster prev2Shape prev2Prefix1 prev2Suffix3 prev2POSTag prev2Flags // previous prevWord prevLemma prevCluster prevShape prevPrefix1 prevSuffix3 prevPOSTag prevFlags // ith token ithWord ithLemma ithCluster ithShape ithPrefix1 ithSuffix3 ithPOSTag ithFlags // next token nextWord nextLemma nextCluster nextShape nextPrefix1 nextSuffix3 nextPOSTag nextFlags // next next token next2Word next2Lemma next2Cluster next2Shape next2Prefix1 next2Suffix3 next2POSTag next2Flags MAXCONTEXTTYPE ) type contextMap [MAXCONTEXTTYPE]string func getContext(prev2, prev, ith, next, next2 *lingo.Annotation) (retVal contextMap) { var listOfFeats = [contexts][featuresPerContext]string{ extractContext(prev2), extractContext(prev), extractContext(ith), extractContext(next), extractContext(next2), } for i, l := range listOfFeats { for j, s := range l { retVal[i*featuresPerContext+j] = s } } return retVal } // type featureContext struct { // word string // lemma string // cluster lingo.Cluster // shape string // prefix string // suffix string // POSTag lingo.POSTag // flag lingo.WordFlag // } // extractContext extracts the feature contexts from a given annotation func extractContext(a *lingo.Annotation) (retVal [featuresPerContext]string) { if a == nil { return retVal } word := a.Lowered // we normalize all the unicode btes first asRunes := []rune(a.Value) loweredRunes := []rune(word) retVal[0] = word retVal[1] = a.Lemma retVal[2] = strconv.Itoa(int(a.Cluster)) retVal[3] = string(a.Shape) // prefix and suffix // we want the characters, not the bytes // 
for the prefix, we'll use the un-normalized version because having that extra fidelity would be useful if len(asRunes) > 0 { retVal[4] = string(asRunes[0]) } else { retVal[4] = "" } if len(loweredRunes) >= 3 { retVal[5] = string(loweredRunes[len(loweredRunes)-3 : len(loweredRunes)]) } else { retVal[5] = "" } retVal[6] = a.POSTag.String() retVal[7] = a.WordFlag.String() return retVal } ================================================ FILE: pos/context_test.go ================================================ package pos import ( "strings" "testing" "github.com/chewxy/lingo" ) var extractContextTest = []struct { val string tag lingo.POSTag shape string pref string suff string flag string clust string }{ {"TEst", lingo.ROOT_TAG, "XXxx", "T", "est", "00000000000110", "1"}, {"TEst", lingo.X, "XXxx", "T", "est", "00000000000110", "1"}, {"NotInClust", lingo.UNKNOWN_TAG, "XxxXxXxxxx", "N", "ust", "00000000000110", "0"}, {"", lingo.X, "", "", "", "00000101111110", "0"}, } func TestExtractContext(t *testing.T) { for i, ects := range extractContextTest { a := lingo.StringToAnnotation(ects.val, dummyFix{}) a.POSTag = ects.tag res := extractContext(a) if res[0] != strings.ToLower(ects.val) { t.Errorf("Test %d: Expected word feature to be %q. Got %q instead", i, strings.ToLower(ects.val), res[0]) } if res[2] != ects.clust { t.Errorf("Test %d: Expected cluster to be %q. Got %q instead", i, ects.clust, res[2]) } if res[3] != ects.shape { t.Errorf("Test %d: Expected shape to be %q. Got %q instead", i, ects.shape, res[3]) } if res[4] != ects.pref { t.Errorf("Test %d: Expected prefix to be %q. Got %q instead", i, ects.pref, res[4]) } if res[5] != ects.suff { t.Errorf("Test %d: Expected suffix to be %q. Got %q instead", i, ects.suff, res[5]) } if res[6] != ects.tag.String() { t.Errorf("Test %d: Expected postag to be %q. Got %q instead", i, ects.tag, res[6]) } if res[7] != ects.flag { t.Errorf("Test %d: Expected flag to be %q. 
Got %q instead", i, ects.flag, res[7]) } } } ================================================ FILE: pos/contexttype_string.go ================================================ // generated by stringer -type=contextType; DO NOT EDIT package pos import "fmt" const _contextType_name = "prev2Wordprev2Lemmaprev2Clusterprev2Shapeprev2Prefix1prev2Suffix3prev2POSTagprev2FlagsprevWordprevLemmaprevClusterprevShapeprevPrefix1prevSuffix3prevPOSTagprevFlagsithWordithLemmaithClusterithShapeithPrefix1ithSuffix3ithPOSTagithFlagsnextWordnextLemmanextClusternextShapenextPrefix1nextSuffix3nextPOSTagnextFlagsnext2Wordnext2Lemmanext2Clusternext2Shapenext2Prefix1next2Suffix3next2POSTagnext2FlagsMAXCONTEXTTYPE" var _contextType_index = [...]uint16{0, 9, 19, 31, 41, 53, 65, 76, 86, 94, 103, 114, 123, 134, 145, 155, 164, 171, 179, 189, 197, 207, 217, 226, 234, 242, 251, 262, 271, 282, 293, 303, 312, 321, 331, 343, 353, 365, 377, 388, 398, 412} func (i contextType) String() string { if i >= contextType(len(_contextType_index)-1) { return fmt.Sprintf("contextType(%d)", i) } return _contextType_name[_contextType_index[i]:_contextType_index[i+1]] } ================================================ FILE: pos/debug.go ================================================ // +build debug package pos import ( "log" "strings" "sync/atomic" ) const BUILD_DEBUG = "POS TAGGER: Debug Build" var TABCOUNT uint32 = 0 var tracking = false func tabcount() int { return int(atomic.LoadUint32(&TABCOUNT)) } func enterLoggingContext() { atomic.AddUint32(&TABCOUNT, 1) tc := tabcount() log.SetPrefix(strings.Repeat("\t", tc)) } func leaveLoggingContext() { tc := tabcount() tc-- if tc < 0 { atomic.StoreUint32(&TABCOUNT, 0) tc = 0 } else { atomic.StoreUint32(&TABCOUNT, uint32(tc)) } log.SetPrefix(strings.Repeat("\t", tc)) } func logf(format string, others ...interface{}) { log.Printf(format, others...) } func recoverFrom(format string, attrs ...interface{}) { if r := recover(); r != nil { log.Printf(format, attrs...) 
// componentUnavailable is the error reported when a named component is not
// available. Its underlying string is the name of that component.
type componentUnavailable string

// Error implements the error interface. The message is the component name
// followed by " unavailable".
func (c componentUnavailable) Error() string {
	// Format the plain string, not c itself: c implements error, so handing it
	// to a %v/%s verb makes fmt call Error again and recurse until the stack
	// overflows.
	return fmt.Sprintf("%s unavailable", string(c))
}

// Component returns the name of the unavailable component.
func (c componentUnavailable) Component() string { return string(c) }
return fmt.Sprintf("tupleFeature {%v, %q, %q}", tf.featureType, tf.value1, tf.value2) } type featureMap map[feature]float64 func (fm featureMap) String() string { var buf bytes.Buffer for f := range fm { fmt.Fprintf(&buf, "%s: 1,\n", f) } return buf.String() } func (fm *featureMap) add(f feature) { (*fm)[f]++ } type sfFeatures [prevLemma_prevPOSTag]singleFeature type tfFeatures [MAXFEATURETYPE - prevLemma_prevPOSTag]tupleFeature func fillFromContext(c contextMap) (sf sfFeatures, tf tfFeatures) { for i := bias; i < prevLemma_prevPOSTag; i++ { sf[i] = singleFeature{i, c[featCtxMap[i]]} } const last = prevLemma_prevPOSTag tf[prevLemma_prevPOSTag-last] = tupleFeature{prevLemma_prevPOSTag, c[prevLemma], c[prevPOSTag]} tf[prevPOSTag_ithWord-last] = tupleFeature{prevPOSTag_ithWord, c[prevPOSTag], c[ithWord]} tf[prevPOSTag_prev2POSTag-last] = tupleFeature{prevPOSTag_prev2POSTag, c[prevPOSTag], c[prev2POSTag]} tf[prev2Lemma_prev2POSTag-last] = tupleFeature{prev2Lemma_prev2POSTag, c[prev2Lemma], c[prev2POSTag]} return } func getFeatures(s lingo.AnnotatedSentence, i int) (sfFeatures, tfFeatures) { length := len(s) // set up context defaults prev2 := lingo.NullAnnotation() prev := lingo.NullAnnotation() ith := s[i] next := lingo.NullAnnotation() next2 := lingo.NullAnnotation() if i-1 >= 0 { prev = s[i-1] } if i-2 >= 0 { prev2 = s[i-2] } if i+1 < length { next = s[i+1] } if i+2 < length { next2 = s[i+2] } c := getContext(prev2, prev, ith, next, next2) return fillFromContext(c) } ================================================ FILE: pos/features_test.go ================================================ // +build stanfordtags package pos import ( "testing" "github.com/chewxy/lingo" "github.com/stretchr/testify/assert" ) func TestGetFeatures(t *testing.T) { assert := assert.New(t) // test two word sentence s2 := lingo.AnnotatedSentence{ lingo.AnnotationFromLexTag(lingo.Lexeme{"most", lingo.Word, -1, -1}, lingo.RBS, dummyFix{}), lingo.AnnotationFromLexTag(lingo.Lexeme{"populous", 
lingo.Word, -1, -1}, lingo.X, dummyFix{}), } featMap := getFeatures(s2, 0) expectedFM := featureMap{ singleFeature{bias, ""}: 1, singleFeature{ithWord_, "most"}: 1, tupleFeature{prevLemma_prevPOSTag, "", "X"}: 1, tupleFeature{prev2Lemma_prev2POSTag, "", "X"}: 1, singleFeature{nextWord_, "populous"}: 1, singleFeature{next2Word_, ""}: 1, singleFeature{ithSuffix3_, "ost"}: 1, singleFeature{ithPrefix1_, "m"}: 1, singleFeature{prevPOSTag_, "X"}: 1, singleFeature{prev2POSTag_, "X"}: 1, tupleFeature{prevPOSTag_prev2_POSTag, "X", "X"}: 1, tupleFeature{prevPOSTag_ithWord, "X", "most"}: 1, singleFeature{prevSuffix3_, ""}: 1, singleFeature{nextSuffix3_, "ous"}: 1, singleFeature{ithShape_, "xxxx"}: 1, singleFeature{ithCluster_, "0"}: 1, singleFeature{nextCluster_, "0"}: 1, singleFeature{next2Cluster_, "0"}: 1, singleFeature{prevCluster_, "0"}: 1, singleFeature{prev2Cluster_, "0"}: 1, singleFeature{ithFlags_, "01000000010110"}: 1, singleFeature{nextFlags_, "00000000010110"}: 1, singleFeature{next2Flags_, "00000000000000"}: 1, singleFeature{prevFlags_, "00000000000000"}: 1, singleFeature{prev2Flags_, "00000000000000"}: 1, } assert.EqualValues(expectedFM, featMap, "Want: \n%v\n\nGot: \n%v", expectedFM, featMap) // test five word sentence s5 := lingo.AnnotatedSentence{ lingo.AnnotationFromLexTag(lingo.Lexeme{"most", lingo.Word, -1, -1}, lingo.RBS, dummyFix{}), lingo.AnnotationFromLexTag(lingo.Lexeme{"populous", lingo.Word, -1, -1}, lingo.X, dummyFix{}), lingo.AnnotationFromLexTag(lingo.Lexeme{"state", lingo.Word, -1, -1}, lingo.X, dummyFix{}), lingo.AnnotationFromLexTag(lingo.Lexeme{"in", lingo.Word, -1, -1}, lingo.X, dummyFix{}), lingo.AnnotationFromLexTag(lingo.Lexeme{"America", lingo.Word, -1, -1}, lingo.X, dummyFix{}), } featMap = getFeatures(s5, 0) // no prev expectedFM = featureMap{ singleFeature{bias, ""}: 1, singleFeature{ithWord_, "most"}: 1, tupleFeature{prevLemma_prevPOSTag, "", "X"}: 1, tupleFeature{prev2Lemma_prev2POSTag, "", "X"}: 1, singleFeature{nextWord_, 
"populous"}: 1, singleFeature{next2Word_, "state"}: 1, singleFeature{ithSuffix3_, "ost"}: 1, singleFeature{ithPrefix1_, "m"}: 1, singleFeature{prevPOSTag_, "X"}: 1, singleFeature{prev2POSTag_, "X"}: 1, tupleFeature{prevPOSTag_prev2_POSTag, "X", "X"}: 1, tupleFeature{prevPOSTag_ithWord, "X", "most"}: 1, singleFeature{prevSuffix3_, ""}: 1, singleFeature{nextSuffix3_, "ous"}: 1, singleFeature{ithShape_, "xxxx"}: 1, singleFeature{ithCluster_, "0"}: 1, singleFeature{nextCluster_, "0"}: 1, singleFeature{next2Cluster_, "0"}: 1, singleFeature{prevCluster_, "0"}: 1, singleFeature{prev2Cluster_, "0"}: 1, singleFeature{ithFlags_, "01000000010110"}: 1, singleFeature{nextFlags_, "00000000010110"}: 1, singleFeature{next2Flags_, "00000000010110"}: 1, singleFeature{prevFlags_, "00000000000000"}: 1, singleFeature{prev2Flags_, "00000000000000"}: 1, } assert.EqualValues(expectedFM, featMap, "Want: \n%v\n\nGot: \n%v", expectedFM, featMap) featMap = getFeatures(s5, 2) // has all the feats expectedFM = featureMap{ singleFeature{bias, ""}: 1, singleFeature{ithWord_, "state"}: 1, tupleFeature{prev2Lemma_prev2POSTag, "", "RBS"}: 1, tupleFeature{prevLemma_prevPOSTag, "", "X"}: 1, singleFeature{nextWord_, "in"}: 1, singleFeature{next2Word_, "america"}: 1, singleFeature{ithSuffix3_, "ate"}: 1, singleFeature{ithPrefix1_, "s"}: 1, singleFeature{prevPOSTag_, "X"}: 1, singleFeature{prev2POSTag_, "RBS"}: 1, tupleFeature{prevPOSTag_prev2_POSTag, "X", "RBS"}: 1, tupleFeature{prevPOSTag_ithWord, "X", "state"}: 1, singleFeature{prevSuffix3_, "ous"}: 1, singleFeature{nextSuffix3_, ""}: 1, singleFeature{ithShape_, "xxxx"}: 1, singleFeature{ithCluster_, "0"}: 1, singleFeature{nextCluster_, "0"}: 1, singleFeature{next2Cluster_, "0"}: 1, singleFeature{prevCluster_, "0"}: 1, singleFeature{prev2Cluster_, "0"}: 1, singleFeature{ithFlags_, "00000000010110"}: 1, singleFeature{nextFlags_, "01000000010110"}: 1, singleFeature{next2Flags_, "00000010000110"}: 1, singleFeature{prevFlags_, "00000000010110"}: 1, 
singleFeature{prev2Flags_, "01000000010110"}: 1, } assert.EqualValues(expectedFM, featMap, "Want: \n%v\n\nGot: \n%v", expectedFM, featMap) featMap = getFeatures(s5, 4) // no nexts expectedFM = featureMap{ singleFeature{bias, ""}: 1, singleFeature{ithWord_, "america"}: 1, tupleFeature{prev2Lemma_prev2POSTag, "", "X"}: 1, tupleFeature{prevLemma_prevPOSTag, "", "X"}: 1, singleFeature{nextWord_, ""}: 1, singleFeature{next2Word_, ""}: 1, singleFeature{ithSuffix3_, "ica"}: 1, singleFeature{ithPrefix1_, "A"}: 1, singleFeature{prevPOSTag_, "X"}: 1, singleFeature{prev2POSTag_, "X"}: 1, tupleFeature{prevPOSTag_prev2_POSTag, "X", "X"}: 1, tupleFeature{prevPOSTag_ithWord, "X", "america"}: 1, singleFeature{prevSuffix3_, ""}: 1, singleFeature{nextSuffix3_, ""}: 1, singleFeature{ithShape_, "Xxxxx"}: 1, singleFeature{ithCluster_, "0"}: 1, singleFeature{nextCluster_, "0"}: 1, singleFeature{next2Cluster_, "0"}: 1, singleFeature{prevCluster_, "0"}: 1, singleFeature{prev2Cluster_, "0"}: 1, singleFeature{ithFlags_, "00000010000110"}: 1, singleFeature{nextFlags_, "00000000000000"}: 1, singleFeature{next2Flags_, "00000000000000"}: 1, singleFeature{prevFlags_, "01000000010110"}: 1, singleFeature{prev2Flags_, "00000000010110"}: 1, } assert.EqualValues(expectedFM, featMap, "Want: \n%v\n\nGot: \n%v", expectedFM, featMap) } ================================================ FILE: pos/featuretype_string.go ================================================ // generated by stringer -type=featureType; DO NOT EDIT package pos import "fmt" const _featureType_name = "biasithWord_prevLemma_prevPOSTagprev2Lemma_prev2POSTagnextWord_next2Word_ithSuffix3_ithPrefix1_prevPOSTag_prev2POSTag_prevPOSTag_prev2_POSTagprevPOSTag_ithWordprevSuffix3_nextSuffix3_ithShape_ithCluster_nextCluster_next2Cluster_prevCluster_prev2Cluster_ithFlags_nextFlags_next2Flags_prevFlags_prev2Flags_MAXFEATURETYPE" var _featureType_index = [...]uint16{0, 4, 12, 32, 54, 63, 73, 84, 95, 106, 118, 141, 159, 171, 183, 192, 203, 215, 228, 
240, 253, 262, 272, 283, 293, 304, 318} func (i featureType) String() string { if i >= featureType(len(_featureType_index)-1) { return fmt.Sprintf("featureType(%d)", i) } return _featureType_name[_featureType_index[i]:_featureType_index[i+1]] } ================================================ FILE: pos/models.go ================================================ package pos import ( "bufio" "encoding/gob" "io" "os" "github.com/chewxy/lingo" ) // Model is the model that the POS Tagger runs on. type Model struct { *perceptron cachedTags map[string]lingo.POSTag } // Save saves the model func (m *Model) Save(filename string) error { f, err := os.Create(filename) if err != nil { return err } return m.SaveWriter(f) } func (m *Model) SaveWriter(f io.WriteCloser) error { defer f.Close() w := bufio.NewWriter(f) defer w.Flush() encoder := gob.NewEncoder(w) if err := encoder.Encode(m.perceptron); err != nil { return err } if err := encoder.Encode(m.cachedTags); err != nil { return err } return nil } func Load(filename string) (*Model, error) { f, err := os.Open(filename) if err != nil { return nil, err } return LoadReader(f) } func LoadReader(rd io.ReadCloser) (*Model, error) { defer rd.Close() r := bufio.NewReader(rd) decoder := gob.NewDecoder(r) m := &Model{ perceptron: newPerceptron(), } if err := decoder.Decode(m.perceptron); err != nil { return nil, err } if err := decoder.Decode(&m.cachedTags); err != nil { return nil, err } return m, nil } func (p *Tagger) Load(filename string) error { m, err := Load(filename) if err != nil { return err } p.Model = m return nil } ================================================ FILE: pos/models_test.go ================================================ package pos import ( "os" "strings" "testing" "github.com/chewxy/lingo/treebank" "github.com/stretchr/testify/assert" ) func TestSaveLoad(t *testing.T) { pt := New() sentences := treebank.ReadConllu(strings.NewReader(conllu)) pt.Train(sentences, 5) pt.Save("test.dat") pt2 := New() if err := 
pt2.Load("test.dat"); err != nil { os.Remove("test.dat") t.Fatal(err) } assert := assert.New(t) assert.Equal(pt.perceptron, pt2.perceptron, "POSTaggers' perceptrons are different:%p %p", pt.perceptron, pt2.perceptron) assert.Equal(pt.cachedTags, pt2.cachedTags, "POSTaggers' cachedTags are different") // cleanup os.Remove("test.dat") } ================================================ FILE: pos/perceptron.go ================================================ package pos import "github.com/chewxy/lingo" type perceptron struct { // weights map[feature]*[lingo.MAXTAG]float64 // it's a pointer to a static array because map values are immutable, and cannot be edited weightsSF map[singleFeature]*[lingo.MAXTAG]float64 weightsTF map[tupleFeature]*[lingo.MAXTAG]float64 totals map[fctuple]float64 steps map[fctuple]float64 instancesSeen float64 } // feature-class tuple is a tuple that contains a feature and a class. This makes calculation of the averaging easier type fctuple struct { feature lingo.POSTag } func newPerceptron() *perceptron { return &perceptron{ // weights: make(map[feature]*[lingo.MAXTAG]float64), weightsSF: make(map[singleFeature]*[lingo.MAXTAG]float64), weightsTF: make(map[tupleFeature]*[lingo.MAXTAG]float64), totals: make(map[fctuple]float64), steps: make(map[fctuple]float64), } } func (p *perceptron) updateWeightsSF(f singleFeature, tag lingo.POSTag, weight, value float64) { tuple := fctuple{f, tag} p.totals[tuple] += (p.instancesSeen - p.steps[tuple]) * weight p.steps[tuple] = p.instancesSeen if _, ok := p.weightsSF[f]; !ok { p.weightsSF[f] = new([lingo.MAXTAG]float64) } p.weightsSF[f][tag] = weight + value } func (p *perceptron) updateWeightsTF(f tupleFeature, tag lingo.POSTag, weight, value float64) { tuple := fctuple{f, tag} p.totals[tuple] += (p.instancesSeen - p.steps[tuple]) * weight p.steps[tuple] = p.instancesSeen if _, ok := p.weightsTF[f]; !ok { p.weightsTF[f] = new([lingo.MAXTAG]float64) } p.weightsTF[f][tag] = weight + value } func (p *perceptron) 
update(guess, truth lingo.POSTag, sf sfFeatures, tf tfFeatures) { p.instancesSeen++ if truth == guess { return } for _, f := range sf { var truthValue float64 var guessValue float64 if weights, ok := p.weightsSF[f]; ok { truthValue = weights[truth] guessValue = weights[guess] } p.updateWeightsSF(f, truth, truthValue, 1) p.updateWeightsSF(f, guess, guessValue, -1) } for _, f := range tf { var truthValue float64 var guessValue float64 if weights, ok := p.weightsTF[f]; ok { truthValue = weights[truth] guessValue = weights[guess] } p.updateWeightsTF(f, truth, truthValue, 1) p.updateWeightsTF(f, guess, guessValue, -1) } } func (p *perceptron) predict(sf sfFeatures, tf tfFeatures) lingo.POSTag { var scores [lingo.MAXTAG]float64 for _, f := range sf { if weights, ok := p.weightsSF[f]; ok { for label, weight := range weights { scores[label] += weight } } } for _, f := range tf { if weights, ok := p.weightsTF[f]; ok { for label, weight := range weights { scores[label] += weight } } } return maxScore(&scores) } func (p *perceptron) average() { for f, weights := range p.weightsSF { for c, weight := range weights { tuple := fctuple{f, lingo.POSTag(c)} total := p.totals[tuple] total += (p.instancesSeen - p.steps[tuple]) * weight avg := total / p.instancesSeen weights[c] = avg } } for f, weights := range p.weightsTF { for c, weight := range weights { tuple := fctuple{f, lingo.POSTag(c)} total := p.totals[tuple] total += (p.instancesSeen - p.steps[tuple]) * weight avg := total / p.instancesSeen weights[c] = avg } } } ================================================ FILE: pos/perceptron_io.go ================================================ package pos import ( "bytes" "encoding/gob" ) /* Feature Gob interface */ func (sf singleFeature) GobEncode() ([]byte, error) { var buf bytes.Buffer encoder := gob.NewEncoder(&buf) if err := encoder.Encode(sf.featureType); err != nil { return nil, err } if err := encoder.Encode(sf.value); err != nil { return nil, err } return buf.Bytes(), nil } 
func (sf *singleFeature) GobDecode(buf []byte) error { b := bytes.NewBuffer(buf) decoder := gob.NewDecoder(b) if err := decoder.Decode(&sf.featureType); err != nil { return err } if err := decoder.Decode(&sf.value); err != nil { return err } return nil } func (tf tupleFeature) GobEncode() ([]byte, error) { var buf bytes.Buffer encoder := gob.NewEncoder(&buf) if err := encoder.Encode(tf.featureType); err != nil { return nil, err } if err := encoder.Encode(tf.value1); err != nil { return nil, err } if err := encoder.Encode(tf.value2); err != nil { return nil, err } return buf.Bytes(), nil } func (tf *tupleFeature) GobDecode(buf []byte) error { b := bytes.NewBuffer(buf) decoder := gob.NewDecoder(b) if err := decoder.Decode(&tf.featureType); err != nil { return err } if err := decoder.Decode(&tf.value1); err != nil { return err } if err := decoder.Decode(&tf.value2); err != nil { return err } return nil } /* fctuple Gob Interface */ func (fc fctuple) GobEncode() ([]byte, error) { var buf bytes.Buffer encoder := gob.NewEncoder(&buf) if err := encoder.Encode(&fc.feature); err != nil { return nil, err } if err := encoder.Encode(fc.POSTag); err != nil { return nil, err } return buf.Bytes(), nil } func (fc *fctuple) GobDecode(buf []byte) error { b := bytes.NewBuffer(buf) decoder := gob.NewDecoder(b) if err := decoder.Decode(&fc.feature); err != nil { return err } if err := decoder.Decode(&fc.POSTag); err != nil { return err } return nil } /* Perceptron Gob Interface */ func (p *perceptron) GobEncode() ([]byte, error) { var buf bytes.Buffer encoder := gob.NewEncoder(&buf) // if err := encoder.Encode(&p.weights); err != nil { // return nil, err // } if err := encoder.Encode(&p.weightsSF); err != nil { return nil, err } if err := encoder.Encode(&p.weightsTF); err != nil { return nil, err } if err := encoder.Encode(&p.totals); err != nil { return nil, err } if err := encoder.Encode(&p.steps); err != nil { return nil, err } if err := encoder.Encode(p.instancesSeen); err != nil { 
return nil, err } return buf.Bytes(), nil } func (p *perceptron) GobDecode(buf []byte) error { b := bytes.NewBuffer(buf) decoder := gob.NewDecoder(b) // if err := decoder.Decode(&p.weights); err != nil { // return err // } if err := decoder.Decode(&p.weightsSF); err != nil { return err } if err := decoder.Decode(&p.weightsTF); err != nil { return err } if err := decoder.Decode(&p.totals); err != nil { return err } if err := decoder.Decode(&p.steps); err != nil { return err } if err := decoder.Decode(&p.instancesSeen); err != nil { return err } return nil } func init() { gob.Register(singleFeature{}) gob.Register(tupleFeature{}) } ================================================ FILE: pos/perceptron_io_test.go ================================================ // +build stanfordtags package pos import ( "bytes" "encoding/gob" "testing" "github.com/chewxy/lingo" "github.com/stretchr/testify/assert" ) func TestFeatureSerialization(t *testing.T) { var f, f2 feature f = singleFeature{ithWord_, "hello"} f2 = tupleFeature{ithWord_, "hello", "world"} var buf bytes.Buffer encoder := gob.NewEncoder(&buf) decoder := gob.NewDecoder(&buf) if err := encoder.Encode(&f); err != nil { t.Fatal(err) } if err := encoder.Encode(&f2); err != nil { t.Fatal(err) } var decodedF, decodedF2 feature if err := decoder.Decode(&decodedF); err != nil { t.Fatal(err) } if err := decoder.Decode(&decodedF2); err != nil { t.Fatal(err) } assert.Equal(t, f, decodedF, "feature not deserialized properly") assert.Equal(t, f2, decodedF2, "feature not deserialized properly") } func TestPerceptron_Serialize(t *testing.T) { p := newPerceptron() // set up a dummy weight f := singleFeature{ithWord_, "hello"} w := new([lingo.MAXTAG]float64) w[lingo.NN] = 0.5 w[lingo.VB] = 0.1 p.weights[f] = w fc := fctuple{f, lingo.VB} p.totals[fc] = 0.1337 p.steps[fc] = 0.65535 p.instancesSeen = 1022 var buf bytes.Buffer encoder := gob.NewEncoder(&buf) decoder := gob.NewDecoder(&buf) // encode if err := encoder.Encode(p); err != 
nil { t.Fatal(err) } // decode p2 := newPerceptron() if err := decoder.Decode(p2); err != nil { t.Fatal(err) } assert := assert.New(t) assert.Equal(p.weights, p2.weights, "The weights have not been deserialized properly") assert.Equal(p.totals, p2.totals, "Totals have not been deserialized properly") assert.Equal(p.steps, p2.steps, "Steps have not been deserialized properly") assert.Equal(p.instancesSeen, p2.instancesSeen, "InstancesSeen not deserialized properly") } ================================================ FILE: pos/postagger.go ================================================ package pos import ( "github.com/chewxy/lingo" "github.com/chewxy/lingo/corpus" "github.com/chewxy/lingo/treebank" ) // Tagger is the object that tags an incoming channel of lexemes, // and outputs a channel of AnnotatedSentence. Each of the Annotation // are tagged with the POSTag // // The core of the Tagger is the perceptron (unexported). // // A large percentage of how this POS Tagger works is inspired by Mathhew Honnibal's work in SpaCy type Tagger struct { *Model Input chan lingo.Lexeme Output chan lingo.AnnotatedSentence progress chan Progress sentences chan lingo.AnnotatedSentence lingo.Lemmatizer lingo.Stemmer corpus *corpus.Corpus clusters map[string]lingo.Cluster // this map is safe for concurrent access because it's readonly } // ConsOpt is a construction option for a Tagger type ConsOpt func(*Tagger) // WithCorpus creates a *Tagger with an existing Corpus func WithCorpus(c *corpus.Corpus) ConsOpt { fn := func(p *Tagger) { p.corpus = c } return fn } // WithLemmatizer creates a *Tagger with a lemmatizer. // If no lemmatizer is passed into the POSTagger, then the lemmatization process will be skipped, and the POSTagger will be less accurate func WithLemmatizer(l lingo.Lemmatizer) ConsOpt { fn := func(p *Tagger) { p.Lemmatizer = l } return fn } // WithStemmer creates a *Tagger with a stemmer. 
// If no stemmer is passed in, then the stemming will be skipped, and the POSTagger will be less accurate func WithStemmer(s lingo.Stemmer) ConsOpt { fn := func(p *Tagger) { p.Stemmer = s } return fn } // WithCluster creates a *Tagger with a brown cluster corpus (a map of strings to the brown clusters). // If no brown cluster corpus was passed in, the cluster won't be set, and the POSTagger will be less accurate func WithCluster(c map[string]lingo.Cluster) ConsOpt { fn := func(p *Tagger) { p.clusters = c } return fn } // WithModel creates a *Tagger with the specified model func WithModel(m *Model) ConsOpt { fn := func(p *Tagger) { p.Model = m } return fn } // New creates a new *Tagger func New(opts ...ConsOpt) *Tagger { p := &Tagger{ Output: make(chan lingo.AnnotatedSentence), sentences: make(chan lingo.AnnotatedSentence), } for _, opt := range opts { opt(p) } if p.Model == nil { p.Model = &Model{perceptron: newPerceptron()} p.cachedTags = make(map[string]lingo.POSTag) } return p } // Clone() makes a copy of a POSTagger func (p *Tagger) Clone() *Tagger { return &Tagger{ Model: p.Model, corpus: p.corpus, Output: make(chan lingo.AnnotatedSentence), sentences: make(chan lingo.AnnotatedSentence), Lemmatizer: p.Lemmatizer, Stemmer: p.Stemmer, clusters: p.clusters, } } // Run is used to tag a sentence. Lexemes arrive from the lexer in a channel (*Tagger.Input), and an annotated sentence is sent down the Output channel func (p *Tagger) Run() { defer close(p.Output) go p.getSentences() for s := range p.sentences { length := len(s) if length == 0 { continue } for i, a := range s { tag, ok := p.shortcut(a.Lexeme) if !ok { sf, tf := getFeatures(s, i) tag = p.perceptron.predict(sf, tf) } p.setTag(a, tag) } p.Output <- s } } // Lemmatize implements the lingo.Lemmatize interface. It however, defers the actual doing of the job to the Lemmatizer. 
func (p *Tagger) Lemmatize(a string, pt lingo.POSTag) ([]string, error) { if p.Lemmatizer == nil { return nil, componentUnavailable("lemmatizer") } return p.Lemmatizer.Lemmatize(a, pt) } // Stem implements the lingo.Stemmer interface. It however, defers the actual stemming to the stemmer passed in. func (p *Tagger) Stem(a string) (string, error) { if p.Stemmer == nil { return "", componentUnavailable("stemmer") } return p.Stemmer.Stem(a) } // Clusters implements the lingo.AnnotationFixer interface. func (p *Tagger) Clusters() (map[string]lingo.Cluster, error) { if p.clusters == nil { return nil, componentUnavailable("clusters") } return p.clusters, nil } // Progress creates and returns a channel of progress. By default the progress channel isn't created, and no progress info is sent func (p *Tagger) Progress() <-chan Progress { if p.progress == nil { p.progress = make(chan Progress) } return p.progress } // Train trains a POSTagger, given a bunch of SentenceTags func (p *Tagger) Train(sentences []treebank.SentenceTag, iterations int) { if p.progress != nil { defer func() { close(p.progress) p.progress = nil }() } p.fillCache(sentences) // Somehow sentenceTag.AnnotatedSentence() is memory leaky. // As a result, the more training iterations there is, the more memory is used and not released // hence the cache is necessary. cache := make(map[string]lingo.AnnotatedSentence) for iter := 0; iter < iterations; iter++ { c := 0 n := 0 shortcutted := 0 var s lingo.AnnotatedSentence for _, sentenceTag := range sentences { tags := []lingo.POSTag{lingo.ROOT_TAG} tags = append(tags, sentenceTag.Tags...) 
var ok bool if s, ok = cache[sentenceTag.String()]; !ok { s = sentenceTag.AnnotatedSentence(p) // the fixer is used to extract cluster information, etc into the *Annotation cache[sentenceTag.String()] = s } length := len(s) if length == 0 { continue } for _, a := range s { if a == lingo.RootAnnotation() { continue } a.POSTag = lingo.X } for i, a := range s { // processing truth := tags[i] guess, ok := p.shortcut(a.Lexeme) if !ok { sf, tf := getFeatures(s, i) guess = p.perceptron.predict(sf, tf) p.perceptron.update(guess, truth, sf, tf) } else { shortcutted++ } p.setTag(a, guess) if guess == truth { c++ } n++ } } if iter%150 == 0 { p.perceptron.average() logf("Averaged perceptron") } if p.progress != nil { p.progress <- Progress{Iter: iter, Correct: c, Count: n, ShortCutted: shortcutted} } treebank.ShuffleSentenceTag(sentences) } p.perceptron.average() } // LoadShortcuts allows for domain specific things to be mapped into the tagger. func (p *Tagger) LoadShortcuts(shortcuts map[string]lingo.POSTag) { for shortcut, tags := range shortcuts { p.cachedTags[shortcut] = tags } } func (p *Tagger) fillCache(sentences []treebank.SentenceTag) { logf("Filling Cache with %d sentences", len(sentences)) var counter = make(map[string]map[lingo.POSTag]int) for _, sentenceTag := range sentences { s := sentenceTag.Sentence tags := sentenceTag.Tags for i, lex := range s { w := lex.Value t := tags[i] _, ok := counter[w] if !ok { counter[w] = make(map[lingo.POSTag]int) } counter[w][t]++ } } freqThresh := 30 ambiguityThresh := 0.98 for word, tagCounter := range counter { var maxTag lingo.POSTag var max int var n int for t, c := range tagCounter { if c > max { maxTag = t max = c } n += c } if n >= freqThresh && float64(max)/float64(n) >= ambiguityThresh { p.cachedTags[word] = maxTag } } } func (p *Tagger) shortcut(l lingo.Lexeme) (lingo.POSTag, bool) { tag, ok := lingo.POSTagShortcut(l) if !ok { tag, ok = p.cachedTags[l.Value] } return tag, ok } func (p *Tagger) setTag(a 
*lingo.Annotation, tag lingo.POSTag) { if a == lingo.NullAnnotation() || a == lingo.RootAnnotation() || a == lingo.StartAnnotation() { return } a.POSTag = tag if lemmas, err := p.Lemmatize(a.Value, tag); err == nil && len(lemmas) > 0 { // sort.Strings(lemmas) a.Lemma = lemmas[0] } if stem, err := p.Stem(a.Value); err == nil { a.Stem = stem } } // Progress is just a tuple of training progress info type Progress struct { Iter, Correct, Count, ShortCutted int } ================================================ FILE: pos/release.go ================================================ // +build !debug package pos const BUILD_DEBUG = "POS TAGGER: Release Build" var TABCOUNT uint32 = 0 var tracking = false func tabcount() int { return 0 } func enterLoggingContext() {} func leaveLoggingContext() {} func logf(format string, others ...interface{}) {} func recoverFrom(format string, attrs ...interface{}) {} func (p *Tagger) ShowWeights() {} func printShortcuts(p *Tagger) {} ================================================ FILE: pos/sentence.go ================================================ package pos import "github.com/chewxy/lingo" // "log" func (p *Tagger) getSentences() { defer close(p.sentences) var sentence lingo.AnnotatedSentence sentence = append(sentence, lingo.RootAnnotation()) for lexeme := range p.Input { if lexeme.LexemeType != lingo.EOF { a := lingo.NewAnnotation() a.Lexeme = lexeme if err := a.Process(p); err != nil { panic(err) // for now } sentence = append(sentence, a) } else { p.sentences <- sentence // reset sentence = lingo.AnnotatedSentence{lingo.RootAnnotation()} } // TODO: Sentence splitting } } ================================================ FILE: pos/test_test.go ================================================ package pos import ( "github.com/chewxy/lingo" "github.com/kljensen/snowball" ) type dummyLem struct{} func (dummyLem) Lemmatize(s string, pt lingo.POSTag) ([]string, error) { if len(s) > 3 { return []string{ s[:2], }, nil } return []string{""}, 
nil } type dummyStemmer struct{} func (dummyStemmer) Stem(s string) (string, error) { return snowball.Stem(s, "english", true) } var clusters = map[string]lingo.Cluster{ "TEst": 1, "Test": 1, "test": 1, } type dummyFix struct { dummyStemmer dummyLem } func (dummyFix) Clusters() (map[string]lingo.Cluster, error) { return clusters, nil } const conllu = `1 From from ADP IN _ 3 case _ _ 2 the the DET DT Definite=Def|PronType=Art 3 det _ _ 3 AP AP PROPN NNP Number=Sing 4 nmod _ _ 4 comes come VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _ 5 this this DET DT Number=Sing|PronType=Dem 6 det _ _ 6 story story NOUN NN Number=Sing 4 nsubj _ _ 7 : : PUNCT : _ 4 punct _ _ 1 President President PROPN NNP Number=Sing 2 compound _ _ 2 Bush Bush PROPN NNP Number=Sing 5 nsubj _ _ 3 on on ADP IN _ 4 case _ _ 4 Tuesday Tuesday PROPN NNP Number=Sing 5 nmod _ _ 5 nominated nominate VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root _ _ 6 two two NUM CD NumType=Card 7 nummod _ _ 7 individuals individual NOUN NNS Number=Plur 5 dobj _ _ 8 to to PART TO _ 9 mark _ _ 9 replace replace VERB VB VerbForm=Inf 5 advcl _ _ 10 retiring retire VERB VBG VerbForm=Ger 11 amod _ _ 11 jurists jurist NOUN NNS Number=Plur 9 dobj _ _ 12 on on ADP IN _ 14 case _ _ 13 federal federal ADJ JJ Degree=Pos 14 amod _ _ 14 courts court NOUN NNS Number=Plur 11 nmod _ _ 15 in in ADP IN _ 18 case _ _ 16 the the DET DT Definite=Def|PronType=Art 18 det _ _ 17 Washington Washington PROPN NNP Number=Sing 18 compound _ _ 18 area area NOUN NN Number=Sing 14 nmod _ _ 19 . . PUNCT . _ 5 punct _ _ 1 Bush Bush PROPN NNP Number=Sing 2 nsubj _ _ 2 nominated nominate VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root _ _ 3 Jennifer Jennifer PROPN NNP Number=Sing 5 compound _ _ 4 M. M. 
PROPN NNP Number=Sing 5 compound _ _ 5 Anderson Anderson PROPN NNP Number=Sing 2 dobj _ _ 6 for for ADP IN _ 11 case _ _ 7 a a DET DT Definite=Ind|PronType=Art 11 det _ _ 8 15 15 NUM CD NumType=Card 10 nummod _ _ 9 - - PUNCT HYPH _ 10 punct _ _ 10 year year NOUN NN Number=Sing 11 compound _ _ 11 term term NOUN NN Number=Sing 2 nmod _ _ 12 as as ADP IN _ 14 case _ _ 13 associate associate ADJ JJ Degree=Pos 14 amod _ _ 14 judge judge NOUN NN Number=Sing 11 nmod _ _ 15 of of ADP IN _ 18 case _ _ 16 the the DET DT Definite=Def|PronType=Art 18 det _ _ 17 Superior Superior PROPN NNP Number=Sing 18 compound _ _ 18 Court Court PROPN NNP Number=Sing 14 nmod _ _ 19 of of ADP IN _ 21 case _ _ 20 the the DET DT Definite=Def|PronType=Art 21 det _ _ 21 District District PROPN NNP Number=Sing 18 nmod _ _ 22 of of ADP IN _ 23 case _ _ 23 Columbia Columbia PROPN NNP Number=Sing 21 nmod _ _ 24 , , PUNCT , _ 2 punct _ _ 25 replacing replace VERB VBG VerbForm=Ger 2 advcl _ _ 26 Steffen Steffen PROPN NNP Number=Sing 28 compound _ _ 27 W. W. PROPN NNP Number=Sing 28 compound _ _ 28 Graae Graae PROPN NNP Number=Sing 25 dobj _ _ 29 . . PUNCT . _ 2 punct _ _ 1 We we PRON PRP Case=Nom|Number=Plur|Person=1|PronType=Prs 3 nsubj _ _ 2 've have AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 3 aux _ _ 3 grown grow VERB VBN Tense=Past|VerbForm=Part 0 root _ _ 4 up up ADP RP _ 3 compound:prt _ _ 5 . . PUNCT . 
_ 3 punct _ _` ================================================ FILE: pos/util.go ================================================ package pos import ( "math" "github.com/chewxy/lingo" ) func maxScore(scores *[lingo.MAXTAG]float64) lingo.POSTag { var maxClass lingo.POSTag maxVal := -math.MaxFloat64 for c, v := range scores { if v > maxVal { maxClass = lingo.POSTag(c) maxVal = v } } return maxClass } ================================================ FILE: pos/util_test.go ================================================ package pos import ( "math" "math/rand" "testing" "github.com/chewxy/lingo" ) func TestMaxScore(t *testing.T) { rand.Seed(1337) scores := new([lingo.MAXTAG]float64) for i := range scores { scores[i] = rand.Float64() if lingo.POSTag(i) == lingo.ROOT_TAG { scores[i] = math.MaxFloat64 } } tag := maxScore(scores) if tag != lingo.ROOT_TAG { t.Errorf("Expected Score #10 to be the max. Got %d instead", tag) } } ================================================ FILE: sentence.go ================================================ package lingo import ( "bytes" "fmt" "sort" "strings" "github.com/pkg/errors" ) /* Lexeme Sentence */ type LexemeSentence []Lexeme func NewLexemeSentence() LexemeSentence { return LexemeSentence(make([]Lexeme, 0)) } func (ls LexemeSentence) String() string { var buf bytes.Buffer for _, lex := range ls { buf.WriteString(lex.Value) buf.WriteString(" ") } return strings.Trim(buf.String(), " ") } /* Annotated Sentence */ // AnnotatedSentence is a sentence, but each word has been annotated. 
type AnnotatedSentence []*Annotation func NewAnnotatedSentence() AnnotatedSentence { return make(AnnotatedSentence, 0) } func (as AnnotatedSentence) Clone() AnnotatedSentence { retVal := make(AnnotatedSentence, len(as)) for i, a := range as { // don't clone rootAnnotation if i == 0 && a == rootAnnotation { retVal[i] = a continue } retVal[i] = a.Clone() } return retVal } func (as AnnotatedSentence) SetID() { for i, a := range as { if i == 0 && a == rootAnnotation { continue } a.ID = i } } func (as AnnotatedSentence) Fix() { if as[0].Lexeme == rootLexeme { as[0] = rootAnnotation } as.SetID() for _, a := range as { if a.Head != nil { if a.HeadID() == -1 && a.Head.Lexeme == rootLexeme { a.Head = rootAnnotation continue } a.SetHead(as[a.HeadID()]) } } } func (as AnnotatedSentence) IsValid() bool { // check that IDs are set zeroes := 0 for _, a := range as { if a.ID == 0 { zeroes++ } } // IDs not properly set if zeroes > 1 { return false } // TODO // check that there is only one root return true } /* Return slices of x */ // Phrase returns the slice of the sentence. While you can do the same by simply doing as[start:end], this method returns errors instead of panicking func (as AnnotatedSentence) Phrase(start, end int) (AnnotatedSentence, error) { if start < 0 { return nil, errors.Errorf("Start: %d < 0", start) } if end > len(as) { return nil, errors.Errorf("End: %d > len(as): %d", end, len(as)) } return as[start:end], nil } // IDs returns the list of IDs in the sentence. The return value has exactly the same length as the sentence. func (as AnnotatedSentence) IDs() []int { retVal := make([]int, len(as)) for i, a := range as { retVal[i] = a.ID } return retVal } // Tags returns the POSTags of the sentence. The return value has exactly the same length as the sentence. func (as AnnotatedSentence) Tags() []POSTag { retVal := make([]POSTag, len(as)) for i, a := range as { retVal[i] = a.POSTag } return retVal } // Heads returns the head IDs of the sentence. 
The return value has exactly the same length as the sentence. func (as AnnotatedSentence) Heads() []int { retVal := make([]int, len(as)) for i, a := range as { retVal[i] = a.HeadID() } return retVal } // Leaves returns the *Annotations which are leaves. If the dependency hasn't been set yet, every single *Annotation is a leaf. func (as AnnotatedSentence) Leaves() (retVal []int) { for i := range as { if len(as.Children(i)) == 0 { retVal = append(retVal, i) } } return } // Labels returns the DependencyTypes of the sentence. The return value has exactly the same length as the sentence. func (as AnnotatedSentence) Labels() []DependencyType { retVal := make([]DependencyType, len(as)) for i, a := range as { retVal[i] = a.DependencyType } return retVal } // StringSlice returns the original words as a slice of string. The return value has exactly the same length as the sentence. func (as AnnotatedSentence) StringSlice() []string { retVal := make([]string, len(as), len(as)) for i, a := range as { retVal[i] = a.Value } return retVal } // LoweredStringSlice returns the lowercased version of the words in the sentence as a slice of string. The return value has exactly the same length as the sentence. func (as AnnotatedSentence) LoweredStringSlice() []string { retVal := make([]string, len(as), len(as)) for i, a := range as { retVal[i] = a.Lowered } return retVal } // Lemmas returns the lemmas as as slice of string. The return value has exactly the same length as the sentence. func (as AnnotatedSentence) Lemmas() []string { lemmas := make([]string, len(as)) for i, a := range as { lemmas[i] = a.Lemma } return lemmas } // Stems returns the stems as a slice of string. The return value has exactly the same length as the sentence. 
func (as AnnotatedSentence) Stems() []string { stems := make([]string, len(as)) for i, a := range as { stems[i] = a.Stem } return stems } func (as AnnotatedSentence) Children(h int) (retVal []int) { for i, v := range as { if v.HeadID() == h { retVal = append(retVal, i) } } return } func (as AnnotatedSentence) Edges() (retVal []DependencyEdge) { for _, a := range as { var head = -1 if a.Head != nil { head = a.HeadID() } if head == -1 { head = 0 } edge := DependencyEdge{as[head], a, a.DependencyType} retVal = append(retVal, edge) } sort.Sort(edgeByID(retVal)) return } /* To other structures */ func (as AnnotatedSentence) Dependency() *Dependency { return NewDependency(FromAnnotatedSentence(as)) } func (as AnnotatedSentence) Tree() *DependencyTree { tracker := make([]*DependencyTree, len(as)) rootNode := NewDependencyTree(nil, 0, rootAnnotation) tracker[0] = rootNode for i := 1; i < len(as); i++ { head := as[i].HeadID() var headDep *DependencyTree if head == -1 { headDep = rootNode } else { headDep = tracker[head] } if headDep == nil { // make a dependency for the head headDep = NewDependencyTree(nil, head, as[head]) tracker[head] = headDep } dep := tracker[i] if dep == nil { dep = NewDependencyTree(headDep, i, as[i]) tracker[i] = dep } else { dep.Parent = headDep } headDep.AddChild(dep) dep.Type = as[i].DependencyType } // return tracker[len(tracker)-1] // log.Printf("Tracker: %v, len(as): %d. 
Root: %v", tracker, len(as), rootNode.Children) return rootNode } // Stringer interface func (as AnnotatedSentence) String() string { var buf bytes.Buffer for i, a := range as { buf.WriteString(fmt.Sprintf("%s/%s", a.Value, a.POSTag)) if i < len(as)-1 { buf.WriteString(" ") } } return buf.String() } func (as AnnotatedSentence) ValueString() string { var buf bytes.Buffer for i, a := range as { buf.WriteString(a.Value) if i < len(as)-1 { buf.WriteString(" ") } } return buf.String() } func (as AnnotatedSentence) LoweredString() string { var buf bytes.Buffer for i, a := range as { buf.WriteString(a.Lowered) if i < len(as)-1 { buf.WriteString(" ") } } return buf.String() } func (as AnnotatedSentence) LemmaString() string { var buf bytes.Buffer for i, a := range as { buf.WriteString(a.Lemma) if i < len(as)-1 { buf.WriteString(" ") } } return buf.String() } func (as AnnotatedSentence) StemString() string { var buf bytes.Buffer for i, a := range as { buf.WriteString(a.Stem) if i < len(as)-1 { buf.WriteString(" ") } } return buf.String() } // sort interface func (as AnnotatedSentence) Len() int { return len(as) } func (as AnnotatedSentence) Swap(i, j int) { as[i], as[j] = as[j], as[i] } func (as AnnotatedSentence) Less(i, j int) bool { return as[i].ID < as[j].ID } ================================================ FILE: sets.go ================================================ package lingo import ( "bytes" "fmt" ) /* TAG SET */ // TagSet is a set of all the POSTags type TagSet [MAXTAG]bool func (ts TagSet) String() string { var buf bytes.Buffer for t, v := range ts { buf.WriteString(fmt.Sprintf("%v: %v\n", POSTag(t), v)) } return buf.String() } // DependencyTypeSet is a set of all the DependencyTypes type DependencyTypeSet [MAXDEPTYPE]bool func (dts DependencyTypeSet) String() string { var buf bytes.Buffer for t, v := range dts { buf.WriteString(fmt.Sprintf("%v: %v\n", DependencyType(t), v)) } return buf.String() } ================================================ FILE: 
shape.go ================================================ package lingo import ( "bytes" "unicode" ) // Shape represents the shape of a word. It's currently implemented as a string type Shape string func (l Lexeme) Shape() Shape { s := l.Value if len(s) > 50 { return Shape("Long") } var buf bytes.Buffer previousCharShape := ' ' currentCharShape := ' ' sequence := 0 for _, c := range s { switch { case unicode.IsLetter(c): if unicode.IsUpper(c) { currentCharShape = 'X' } else { currentCharShape = 'x' } case unicode.IsDigit(c): currentCharShape = 'd' case l.LexemeType == URI: return Shape("URI") default: currentCharShape = c } if previousCharShape == currentCharShape { sequence++ } else { sequence = 0 // reset the sequence previousCharShape = currentCharShape } if sequence < 4 { buf.WriteRune(currentCharShape) } } retVal := buf.String() return Shape(retVal) } ================================================ FILE: stopwords.go ================================================ package lingo import "strings" const sw = `a about above across after afterwards again against all almost alone along already also although always am among amongst amoungst amount an and another any anyhow anyone anything anyway anywhere are around as at back be became because become becomes becoming been before beforehand behind being below beside besides between beyond bill both bottom but by call can cannot cant co computer con could couldnt cry de describe detail did didn do does doesn doing don done down due during each eg eight either eleven else elsewhere empty enough etc even ever every everyone everything everywhere except few fifteen fify fill find fire first five for former formerly forty found four from front full further get give go had has hasnt have he hence her here hereafter hereby herein hereupon hers herself him himself his how however hundred i ie if in inc indeed interest into is it its itself just keep kg km last latter latterly least less ltd made make many may me meanwhile 
// UnescapeSpecials converts treebank-style escaped tokens back to the text they
// stand for. Words that are not a known escape are returned unchanged.
//
// Recognised escapes:
//
//	-LRB- -RRB-   round brackets  ( )
//	-LSB- -RSB-   square brackets [ ]
//	-LCB- -RCB-   curly brackets  { }
//	`` ''         opening / closing double quote, both mapped to "
//	-NULL-        the empty string
func UnescapeSpecials(word string) string {
	switch word {
	case "-LRB-":
		return "("
	case "-RRB-":
		return ")"
	case "-LSB-":
		return "["
	case "-RSB-":
		return "]"
	case "-LCB-":
		return "{"
	case "-RCB-":
		return "}"
	case "``", "''":
		// The closing quote is unescaped the same way as the opening quote.
		return "\""
	case "-NULL-":
		return ""
	}
	return word
}
"PRP$": lingo.PPRP, "RB": lingo.RB, "RBR": lingo.RBR, "RBS": lingo.RBS, "RP": lingo.RP, "SYM": lingo.SYM, "TO": lingo.TO, "UH": lingo.UH, "VB": lingo.VB, "VBD": lingo.VBD, "VBG": lingo.VBG, "VBN": lingo.VBN, "VBP": lingo.VBP, "VBZ": lingo.VBZ, "WDT": lingo.WDT, "WP": lingo.WP, "PWP": lingo.PWP, "WP$": lingo.PWP, "WRB": lingo.WRB, // punctuation ",": lingo.COMMA, "``": lingo.OPENQUOTE, "''": lingo.CLOSEQUOTE, ".": lingo.FULLSTOP, ":": lingo.COLON, "$": lingo.DOLLAR, "#": lingo.HASHSIGN, "-LRB-": lingo.LEFTBRACE, "-RRB-": lingo.RIGHTBRACE, "ADD": lingo.ADD, "NFP": lingo.NFP, "HYPH": lingo.HYPH, "GW": lingo.GW, "AFX": lingo.AFX, "XX": lingo.XX, "-NULL-": lingo.X, "-ROOT-": lingo.ROOT_TAG, "-UNKNOWN-": lingo.UNKNOWN_TAG, } ================================================ FILE: treebank/const_postag_universal.go ================================================ // +build !stanfordtags package treebank import "github.com/chewxy/lingo" var posTagTable map[string]lingo.POSTag = map[string]lingo.POSTag{ "X": lingo.X, "ADJ": lingo.ADJ, "ADP": lingo.ADP, "ADV": lingo.ADV, "AUX": lingo.AUX, "CONJ": lingo.CONJ, "DET": lingo.DET, "INTJ": lingo.INTJ, "NOUN": lingo.NOUN, "NUM": lingo.NUM, "PART": lingo.PART, "PRON": lingo.PRON, "PROPN": lingo.PROPN, "PUNCT": lingo.PUNCT, "SCONJ": lingo.SCONJ, "SYM": lingo.SYM, "VERB": lingo.VERB, "-NULL-": lingo.X, "-ROOT-": lingo.ROOT_TAG, "-UNKNOWN-": lingo.UNKNOWN_TAG, } ================================================ FILE: treebank/const_rel_stanford.go ================================================ // +build stanfordrel package treebank import "github.com/chewxy/lingo" var dependencyTable map[string]lingo.DependencyType = map[string]lingo.DependencyType{ "root": lingo.Root, "dep": lingo.Dep, "aux": lingo.Aux, "auxpass": lingo.AuxPass, "cop": lingo.Cop, "arg": lingo.Arg, "agent": lingo.Agent, "comp": lingo.Comp, "acomp": lingo.AComp, "ccomp": lingo.CComp, "xcomp": lingo.XComp, "obj": lingo.Obj, "dobj": lingo.DObj, "iobj": lingo.IObj, "pobj": 
lingo.PObj, "subj": lingo.Subj, "nsubj": lingo.NSubj, "nsubjpass": lingo.NSubjPass, "csubj": lingo.CSubj, "csubjpass": lingo.CSubjPass, "cc": lingo.Coordination, "conj": lingo.Conj, "expl": lingo.Expl, "mod": lingo.Mod, "amod": lingo.AMod, "appos": lingo.Appos, "advcl": lingo.Advcl, "det": lingo.Det, "predet": lingo.Predet, "preconj": lingo.Preconj, "vmod": lingo.Vmod, "mwe": lingo.MWE, "mark": lingo.Mark, "advmod": lingo.AdvMod, "neg": lingo.Neg, "rcmod": lingo.RCMod, "quantmod": lingo.QuantMod, "nn": lingo.NounMod, "npadvmod": lingo.NPAdvMod, "tmod": lingo.TMod, "num": lingo.Num, "number": lingo.NumberElement, "prep": lingo.Prep, "poss": lingo.Poss, "possessive": lingo.Possessive, "prt": lingo.PRT, "parataxis": lingo.Parataxis, "goeswith": lingo.GoesWith, "punct": lingo.Punct, "ref": lingo.Ref, "sdep": lingo.SDep, "xsubj": lingo.XSubj, // additional stuff not found in the original, but found in EWT "case": lingo.Case, "compound": lingo.Compound, "nmod": lingo.NMod, "discourse": lingo.Discourse, "nummod": lingo.NumMod, "relcl": lingo.RelCl, "nfincl": lingo.NFinCl, "nmod:poss": lingo.NMod_Poss, "nmod:npmod": lingo.NMod_NPMod, "vocative": lingo.Vocative, "list": lingo.List, "mwprep": lingo.MWPrep, "remnant": lingo.Remnant, "acl": lingo.Acl, "npmod": lingo.NPMod, "mdvod": lingo.MDVod, "detmod": lingo.DetMod, // found in NNParser "pcomp": lingo.PComp, "-NULL-": lingo.Dep, } ================================================ FILE: treebank/const_rel_universal.go ================================================ // +build !stanfordrel package treebank import "github.com/chewxy/lingo" var dependencyTable map[string]lingo.DependencyType = map[string]lingo.DependencyType{ "dep": lingo.Dep, "root": lingo.Root, "nsubj": lingo.NSubj, "nsubjpass": lingo.NSubjPass, "dobj": lingo.DObj, "iobj": lingo.IObj, "csubj": lingo.CSubj, "csubjpass": lingo.CSubjPass, "ccomp": lingo.CComp, "xcomp": lingo.XComp, "nummod": lingo.NumMod, "appos": lingo.Appos, "nmod": lingo.NMod, "acl": lingo.ACl, 
"acl:relcl": lingo.ACl_RelCl, "det": lingo.Det, "det:predet": lingo.Det_PreDet, "amod": lingo.AMod, "neg": lingo.Neg, "case": lingo.Case, "nmod:npmod": lingo.NMod_NPMod, "nmod:tmod": lingo.NMod_TMod, "nmod:poss": lingo.NMod_Poss, "advcl": lingo.AdvCl, "advmod": lingo.AdvMod, "compound": lingo.Compound, "compound:prt": lingo.Compound_Part, "name": lingo.Name, "mwe": lingo.MWE, "foreign": lingo.Foreign, "goeswith": lingo.GoesWith, "list": lingo.List, "dislocated": lingo.Dislocated, "parataxis": lingo.Parataxis, "remnant": lingo.Remnant, "reparandum": lingo.Reparandum, "vocative": lingo.Vocative, "discourse": lingo.Discourse, "expl": lingo.Expl, "aux": lingo.Aux, "auxpass": lingo.AuxPass, "cop": lingo.Cop, "mark": lingo.Mark, "punct": lingo.Punct, "conj": lingo.Conj, "cc": lingo.Coordination, "cc:preconj": lingo.CC_PreConj, // https://github.com/UniversalDependencies/docs/issues/221 "conj:preconj": lingo.CC_PreConj, // https://github.com/UniversalDependencies/docs/issues/221 "-NULL-": lingo.NoDepType, } ================================================ FILE: treebank/sentenceTag.go ================================================ package treebank import ( "math/rand" "github.com/chewxy/lingo" ) // SentenceTag is a struc that holds a sentence, tags, heads and labels type SentenceTag struct { Sentence lingo.LexemeSentence Tags []lingo.POSTag Heads []int Labels []lingo.DependencyType } func (s SentenceTag) AnnotatedSentence(f lingo.AnnotationFixer) lingo.AnnotatedSentence { retVal := lingo.NewAnnotatedSentence() retVal = append(retVal, lingo.RootAnnotation()) for i, lex := range s.Sentence { a := lingo.NewAnnotation() a.Lexeme = lex a.POSTag = s.Tags[i] a.DependencyType = s.Labels[i] // should panic, because SentenceTag is only ever used during training if err := a.Process(f); err != nil { panic(err) } retVal = append(retVal, a) } // add heads for i, a := range retVal { if i == 0 { continue } a.SetHead(retVal[s.Heads[i-1]]) } retVal.Fix() return retVal } func (s 
SentenceTag) Dependency(f lingo.AnnotationFixer) *lingo.Dependency { sentence := s.AnnotatedSentence(f) dep := sentence.Dependency() return dep } func (s SentenceTag) String() string { return s.Sentence.String() } func ShuffleSentenceTag(s []SentenceTag) []SentenceTag { rand.Seed(1337) for i := range s { j := rand.Intn(i + 1) s[i], s[j] = s[j], s[i] } return s } /* UTILITY FUNCTIONS */ func WrapLexemeSentence(sentence lingo.LexemeSentence) lingo.LexemeSentence { retSentence := lingo.NewLexemeSentence() retSentence = append(retSentence, lingo.StartLexeme()) retSentence = append(retSentence, sentence...) retSentence = append(retSentence, lingo.RootLexeme()) return retSentence } func WrapTags(tagList []lingo.POSTag) []lingo.POSTag { retVal := append([]lingo.POSTag{lingo.X}, tagList...) retVal = append(retVal, lingo.X) return retVal } func WrapHeads(heads []int) []int { retVal := append([]int{0}, heads...) retVal = append(retVal, 0) return retVal } func WrapDeps(deps []lingo.DependencyType) []lingo.DependencyType { retVal := append([]lingo.DependencyType{lingo.Dep}, deps...) 
retVal = append(retVal, lingo.Dep) return retVal } ================================================ FILE: treebank/sentenceTag_test.go ================================================ package treebank import ( "strings" "testing" "github.com/stretchr/testify/assert" ) func TestSentenceTag(t *testing.T) { assert := assert.New(t) readr := strings.NewReader(sampleConllu) st := ReadConllu(readr)[0] correctHeads := []int{2, 5, 4, 5, 0, 7, 5, 9, 5, 11, 9, 14, 14, 11, 18, 18, 18, 14, 5} assert.Equal(correctHeads, st.Heads) dep := st.Dependency(nil) assert.Equal(correctHeads, dep.Heads()[1:]) } ================================================ FILE: treebank/treebank.go ================================================ package treebank import ( "archive/zip" "io" "log" "github.com/chewxy/lingo" "bufio" "os" "strconv" "strings" ) var empty struct{} // Loader is anything that loads into a slice of SentenceTags. For future uses, to load tree banks type Loader func(string) []SentenceTag // LoadUniversal loads a treebank file formatted in a CONLLU format func LoadUniversal(fileName string) []SentenceTag { f, err := os.Open(fileName) if err != nil { log.Printf("filename %q", fileName) panic(err) } defer f.Close() return ReadConllu(f) } // ReadConllu reads a file formatted in a CONLLU format func ReadConllu(reader io.Reader) []SentenceTag { s, st, sh, sdt := reset() sentences := make([]SentenceTag, 0) sentenceCount := 0 var usedTags lingo.TagSet var usedDepTypes lingo.DependencyTypeSet var unknownTags = make(map[string]struct{}) var unknownDepType = make(map[string]struct{}) colCount := 0 for bs := bufio.NewScanner(reader); bs.Scan(); colCount++ { l := bs.Text() if strings.HasPrefix(l, "#") { // comments continue } if len(l) == 0 { // then this is a new sentence sentences = finish(s, st, sh, sdt, sentences) s, st, sh, sdt = reset() sentenceCount++ continue } cols := strings.Split(l, "\t") word := cols[1] var tag string switch lingo.BUILD_TAGSET { case "stanfordtags": tag = cols[4] 
case "universaltags": tag = cols[3] default: panic("Unknown tagset") } head := cols[6] depType := cols[7] var t lingo.POSTag var dt lingo.DependencyType var h int var ok bool var err error word = lingo.UnescapeSpecials(word) lexType := StringToLexType(tag) if t, ok = StringToPOSTag(tag); ok { usedTags[t] = true } else { unknownTags[tag] = empty } if h, err = strconv.Atoi(head); err != nil { panic(err) // panic is the right option, because there is no default } if dt, ok = StringToDependencyType(depType); ok { usedDepTypes[dt] = true } else { unknownDepType[depType] = empty } lexeme := lingo.Lexeme{word, lexType, sentenceCount, colCount, 0} // TODO: add byte offset s = append(s, lexeme) st = append(st, t) sh = append(sh, h) sdt = append(sdt, dt) } return sentences } // LoadEWT loads a zipped English Web Treebank (as donated by Google) func LoadEWT(filename string) []SentenceTag { r, err := zip.OpenReader(filename) if err != nil { panic(err) } defer r.Close() sentences := make([]SentenceTag, 0) for _, f := range r.File { contents, err := f.Open() if err != nil { panic(err) } sentences = append(sentences, ReadConllu(contents)...) 
contents.Close() } return sentences } ================================================ FILE: treebank/treebank_test.go ================================================ package treebank import ( "strings" "testing" "github.com/chewxy/lingo" "github.com/stretchr/testify/assert" ) const sampleConllu = `1 President President PROPN NNP Number=Sing 2 compound _ _ 2 Bush Bush PROPN NNP Number=Sing 5 nsubj _ _ 3 on on ADP IN _ 4 case _ _ 4 Tuesday Tuesday PROPN NNP Number=Sing 5 nmod _ _ 5 nominated nominate VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root _ _ 6 two two NUM CD NumType=Card 7 nummod _ _ 7 individuals individual NOUN NNS Number=Plur 5 dobj _ _ 8 to to PART TO _ 9 mark _ _ 9 replace replace VERB VB VerbForm=Inf 5 advcl _ _ 10 retiring retire VERB VBG VerbForm=Ger 11 amod _ _ 11 jurists jurist NOUN NNS Number=Plur 9 dobj _ _ 12 on on ADP IN _ 14 case _ _ 13 federal federal ADJ JJ Degree=Pos 14 amod _ _ 14 courts court NOUN NNS Number=Plur 11 nmod _ _ 15 in in ADP IN _ 18 case _ _ 16 the the DET DT Definite=Def|PronType=Art 18 det _ _ 17 Washington Washington PROPN NNP Number=Sing 18 compound _ _ 18 area area NOUN NN Number=Sing 14 nmod _ _ 19 . . PUNCT . 
_ 5 punct _ _ ` func Test_ReadConllu(t *testing.T) { assert := assert.New(t) st := ReadConllu(strings.NewReader(sampleConllu))[0] correctHeads := []int{2, 5, 4, 5, 0, 7, 5, 9, 5, 11, 9, 14, 14, 11, 18, 18, 18, 14, 5} assert.Equal(correctHeads, st.Heads) // we compare by string to avoid having to build two different test files var correctPOS []string if lingo.BUILD_TAGSET == "stanfordtags" { correctPOS = []string{ "NNP", "NNP", "IN", "NNP", "VBD", "CD", "NNS", "TO", "VB", "VBG", "NNS", "IN", "JJ", "NNS", "IN", "DT", "NNP", "NN", "FULLSTOP", } } else { correctPOS = []string{ "PROPN", "PROPN", "ADP", "PROPN", "VERB", "NUM", "NOUN", "PART", "VERB", "VERB", "NOUN", "ADP", "ADJ", "NOUN", "ADP", "DET", "PROPN", "NOUN", "PUNCT", } } assert.Equal(correctPOS, ttos(st.Tags)) // the stanford tags are not listed in the CONLLU format if lingo.BUILD_RELSET != "stanfordrel" { var correctRel []string correctRel = []string{ "Compound", "NSubj", "Case", "NMod", "Root", "NumMod", "DObj", "Mark", "AdvCl", "AMod", "DObj", "Case", "AMod", "NMod", "Case", "Det", "Compound", "NMod", "Punct", } assert.Equal(correctRel, ltos(st.Labels)) } } func ttos(ts []lingo.POSTag) []string { retVal := make([]string, len(ts)) for i, t := range ts { retVal[i] = t.String() } return retVal } func ltos(ls []lingo.DependencyType) []string { retVal := make([]string, len(ls)) for i, l := range ls { retVal[i] = l.String() } return retVal } ================================================ FILE: treebank/util.go ================================================ package treebank import "github.com/chewxy/lingo" var alreadyLogged map[string]bool = make(map[string]bool) // TODO : CHECK func StringToLexType(tag string) lingo.LexemeType { var lexType lingo.LexemeType switch tag { case "NUM": lexType = lingo.Number case "PUNCT": lexType = lingo.Punctuation case "SYM": lexType = lingo.Symbol default: lexType = lingo.Word } return lexType } func StringToPOSTag(tag string) (lingo.POSTag, bool) { t, ok := posTagTable[tag] 
// isAscii reports whether r is an ASCII code point, i.e. lies in the range
// U+0000..U+007F. Code points from U+0080 upwards (including the Latin-1
// supplement up to U+00FF) and negative values are not ASCII.
func isAscii(r rune) bool {
	return r >= 0 && r <= 0x7F
}
A word may have multiple flags type WordFlag uint32 const ( NoFlag WordFlag = iota IsLetter IsAscii IsDigit IsLower IsPunct IsSpace IsTitle IsUpper LikeURL LikeNum LikeEmail IsStopWord IsOOV // for ner MAXFLAG ) func (f WordFlag) String() string { return fmt.Sprintf("%014b", f) } func (l Lexeme) Flags() WordFlag { var wf WordFlag s := l.Value if StringIs(s, unicode.IsLetter) { wf |= (1 << IsLetter) } if StringIs(s, unicode.IsDigit) { wf |= (1 << IsDigit) } if StringIs(s, isAscii) { wf |= (1 << IsAscii) } if StringIs(s, unicode.IsLower) { wf |= (1 << IsLower) } if StringIs(s, unicode.IsPunct) { wf |= (1 << IsPunct) } if StringIs(s, unicode.IsSpace) { wf |= (1 << IsSpace) } if StringIs(s, unicode.IsUpper) { wf |= (1 << IsUpper) } if l.LexemeType == URI { wf |= (1 << LikeURL) } if _, ok := NumberWords[strings.ToLower(s)]; ok { wf |= (1 << LikeNum) } if _, ok := stopwords[s]; ok { wf |= (1 << IsStopWord) } if len(s) > 0 { if (unicode.IsUpper(rune(s[0])) || unicode.IsTitle(rune(s[0]))) && StringIs(s[1:], unicode.IsLower) { wf |= (1 << IsTitle) } } return wf }