[
  {
    "path": ".gitignore",
    "content": "# Compiled Object files, Static and Dynamic libs (Shared Objects)\n*.o\n*.a\n*.so\n\n# Folders\n_obj\n_test\n\n# Architecture specific extensions/prefixes\n*.[568vq]\n[568vq].out\n\n*.cgo1.go\n*.cgo2.c\n_cgo_defun.c\n_cgo_gotypes.go\n_cgo_export.*\n\n_testmain.go\n\n*.exe\n*.test\n*.prof\n"
  },
  {
    "path": ".travis.yml",
    "content": "language: go\n\nbranches:\n  only:\n    - master\n\ngo:\n  - 1.11.x\n  - 1.12.x\n  - 1.13.x\n  - tip\n\nenv:\n  - GO111MODULE=on\n\nmatrix:\n  allow_failures:\n    - go: tip\n"
  },
  {
    "path": "CONTRIBUTING.md",
    "content": "# Contributing #\n\nContributors are welcome! We want to make contributing as easy as possible, and the process is very Github-centric. [Github Issues](https://github.com/chewxy/lingo/issues) are used to manage any contributions and changes. If you don't have a github account, please feel free to email me (my user name [at] gmail.com), and I'll gladly open an issue on your behalf.\n\n# Process #\n\nSay you have a change you want to make. This is the process:\n\n1. Open an issue.\n2. I'll have a brief discussion with you. If you don't feel comfortable with a public discussion, I'm okay with email. \n3. Fork this project on Github, and clone it to your local machine.\n4. Make your changes\n5. Make sure you have tests. If you foresee breaking any API, it is vital that it be discussed beforehand.\n6. Make sure your tests pass.\n7. `gofmt` your code\n8. Send a Pull Request.\n\nSay you instead saw one of the [many issues](https://github.com/chewxy/lingo/issues) and want to solve one of them. This is the process:\n\n1. Comment on the issue saying you'll pick it up. (Alternatively, email me)\n2. Fork this project on Github, and clone it to your local machine.\n3. Make your changes\n4. Make sure you have tests. If you foresee breaking any API, it is vital that it be discussed beforehand.\n5. Make sure your tests pass.\n6. `gofmt` your code\n7. Send a Pull Request.\n\n## Pull Requests ##\n\nI'll review every pull request. I may request some changes, or delve into further discussions. After that, once I'm satisfied everything passes, I'll merge the pull request. Then I'll add your name into the CONTRIBUTORS list.\n\n# Debugging #\n\nThis package comes with a debug tag option. Most subpackages will have a `debug.go` which contains a `logf` function for logging any traces you wish to record. "
  },
  {
    "path": "CONTRIBUTORS.md",
    "content": "# Contributors #\n\n* Xuanyi Chew (@chewxy) - initial package"
  },
  {
    "path": "LICENSE",
    "content": "MIT License\n\nCopyright (c) 2017 Chewxy\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "POSTag.go",
    "content": "package lingo\n\nimport (\n\t\"fmt\"\n\t\"strings\"\n)\n\n// POSTag represents a Part of Speech Tag.\ntype POSTag byte\n\nvar posTagLookup map[string]POSTag\n\nfunc init() {\n\tposTagLookup = make(map[string]POSTag)\n\tfor t := X; t < MAXTAG; t++ {\n\t\ts := t.String()\n\t\tposTagLookup[s] = POSTag(t)\n\t\tposTagLookup[strings.ToLower(s)] = POSTag(t)\n\t}\n}\n\nfunc (p POSTag) MarshalText() ([]byte, error) {\n\treturn []byte(fmt.Sprintf(\"%v\", p)), nil // add quotes back\n}\n\nfunc (p *POSTag) UnmarshalText(text []byte) error {\n\tstr := strings.Trim(string(text), `\"`) // for JSON use, if any\n\ttag, _ := posTagLookup[str]\n\t*p = tag\n\treturn nil\n}\n\n// POSTag related functions\nfunc InPOSTags(x POSTag, set []POSTag) bool {\n\tfor _, v := range set {\n\t\tif v == x {\n\t\t\treturn true\n\t\t}\n\t}\n\treturn false\n}\n\nfunc IsAdjective(x POSTag) bool     { return InPOSTags(x, Adjectives) }\nfunc IsNoun(x POSTag) bool          { return InPOSTags(x, Nouns) }\nfunc IsProperNoun(x POSTag) bool    { return InPOSTags(x, ProperNouns) }\nfunc IsVerb(x POSTag) bool          { return InPOSTags(x, Verbs) }\nfunc IsAdverb(x POSTag) bool        { return InPOSTags(x, Adverbs) }\nfunc IsInterrogative(x POSTag) bool { return InPOSTags(x, Interrogatives) }\nfunc IsDeterminer(x POSTag) bool    { return InPOSTags(x, Determiners) }\nfunc IsNumber(x POSTag) bool        { return InPOSTags(x, Numbers) }\nfunc IsSymbol(x POSTag) bool        { return InPOSTags(x, Symbols) }\n"
  },
  {
    "path": "POSTag_stanford.go",
    "content": "// +build stanfordtags\n\npackage lingo\n\n//go:generate stringer -type=POSTag -output=POSTag_stanford_string.go\n\nconst BUILD_TAGSET = \"stanfordtags\"\n\nconst (\n\tX           POSTag = iota // aka NULLTAG\n\tUNKNOWN_TAG               // Unknown\n\tROOT_TAG                  // For Root\n\tCC                        // Coordinating conjunction\n\tCD                        // Cardinal number\n\tDT                        // Determiner\n\tEX                        // Existential there\n\tFW                        // Foreign word\n\tIN                        // Preposition or subordinating conjunction\n\tJJ                        // Adjective\n\tJJR                       // Adjective, comparative\n\tJJS                       // Adjective, superlative\n\tLS                        // List item marker\n\tMD                        // Modal\n\tNN                        // Noun, singular or mass\n\tNNS                       // Noun, plural\n\tNNP                       // Proper noun, singular\n\tNNPS                      // Proper noun, plural\n\tPDT                       // Predeterminer\n\tPOS                       // Possessive ending\n\tPRP                       // Personal pronoun\n\tPPRP                      // Possessive pronoun (PRP$)\n\tRB                        // Adverb\n\tRBR                       // Adverb, comparative\n\tRBS                       // Adverb, superlative\n\tRP                        // Particle\n\tSYM                       // Symbol\n\tTO                        // to\n\tUH                        // Interjection\n\tVB                        // Verb, base form\n\tVBD                       // Verb, past tense\n\tVBG                       // Verb, gerund or present participle\n\tVBN                       // Verb, past participle\n\tVBP                       // Verb, non-3rd person singular present\n\tVBZ                       // Verb, 3rd person singular present\n\tWDT                       // Wh-determiner\n\tWP                      
  // Wh-pronoun\n\tPWP                       // Possessive wh-pronoun (WP$)\n\tWRB                       // Wh-adverb\n\n\t// Punctuation related stuff: http://stackoverflow.com/a/21546294\n\tCOMMA      // Obvious isn't it?\n\tFULLSTOP   // fullstop\n\tOPENQUOTE  // Penn Treebank uses ``\n\tCLOSEQUOTE // Penn Treebank uses ''\n\tCOLON\n\tDOLLAR\n\tHASHSIGN\n\tLEFTBRACE\n\tRIGHTBRACE\n\n\t// Extensions for web shit: https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/etb-supplementary-guidelines-2009-addendum.pdf\n\t// http://clear.colorado.edu/compsem/documents/treebank_guidelines.pdf\n\tHYPH // Hyphen in split compounds\n\tAFX  // affix\n\tADD  // url or email addy\n\tNFP  // superfluous (non final) punctuation\n\tGW   // Goes With\n\tXX   // deidentified data (aka gibberish)\n\n\tMAXTAG\n)\n\n// POSTagShortcut is a shortcut function to help the POSTagger shortcircuit some decisions about what the tag is\nfunc POSTagShortcut(l Lexeme) (POSTag, bool) {\n\tswitch l.LexemeType {\n\tcase Number:\n\t\treturn CD, true\n\tcase Punctuation:\n\t\tswitch l.Value {\n\t\tcase \",\":\n\t\t\treturn COMMA, true\n\t\tcase \".\":\n\t\t\treturn FULLSTOP, true\n\t\tcase \"``\":\n\t\t\treturn OPENQUOTE, true\n\t\tcase \"''\":\n\t\t\treturn CLOSEQUOTE, true\n\t\tcase \":\":\n\t\t\treturn COLON, true\n\t\tcase \"#\":\n\t\t\treturn HASHSIGN, true\n\t\tcase \"(\":\n\t\t\treturn LEFTBRACE, true\n\t\tcase \")\":\n\t\t\treturn RIGHTBRACE, true\n\t\tdefault:\n\t\t\treturn X, false\n\t\t}\n\tcase Symbol:\n\t\treturn SYM, true\n\tcase URI:\n\t\treturn ADD, true\n\tcase Date:\n\t\treturn CD, true\n\tcase Time:\n\t\treturn CD, true\n\tcase EOF:\n\t\treturn X, true\n\t}\n\treturn X, false\n}\n\n// sets\n\nvar Adjectives = []POSTag{JJ, JJR, JJS}\nvar Nouns = []POSTag{NN, NNP, NNS, NNPS}\nvar ProperNouns = []POSTag{NNP, NNPS}\nvar Verbs = []POSTag{VB, VBD, VBG, VBN, VBP, VBZ}\nvar Adverbs = []POSTag{RB, RBR, RBS}\nvar Determiners = []POSTag{DT, PDT}\nvar Interrogatives = []POSTag{WDT, WP, PWP, 
WRB}\nvar Numbers = []POSTag{CD}\nvar Symbols = []POSTag{SYM, FULLSTOP, COMMA, OPENQUOTE, COLON, DOLLAR, HASHSIGN, LEFTBRACE, RIGHTBRACE, HYPH, NFP}\n\n// IsIN returns true if the POSTag is a subordinating conjunction.\n// The reason why this exists is because in the stanford tag, IN is the POSTag\n// while in the universal dependencies, it's the SCONJ POSTag\nfunc IsIN(x POSTag) bool { return x == IN }\n"
  },
  {
    "path": "POSTag_stanford_string.go",
    "content": "// +build stanfordtags\n\n// Code generated by \"stringer -type=POSTag -output=POSTag_stanford_string.go\"; DO NOT EDIT\n\npackage lingo\n\nimport \"fmt\"\n\nconst _POSTag_name = \"XUNKNOWN_TAGROOT_TAGCCCDDTEXFWINJJJJRJJSLSMDNNNNSNNPNNPSPDTPOSPRPPPRPRBRBRRBSRPSYMTOUHVBVBDVBGVBNVBPVBZWDTWPPWPWRBCOMMAFULLSTOPOPENQUOTECLOSEQUOTECOLONDOLLARHASHSIGNLEFTBRACERIGHTBRACEHYPHAFXADDNFPGWXXMAXTAG\"\n\nvar _POSTag_index = [...]uint8{0, 1, 12, 20, 22, 24, 26, 28, 30, 32, 34, 37, 40, 42, 44, 46, 49, 52, 56, 59, 62, 65, 69, 71, 74, 77, 79, 82, 84, 86, 88, 91, 94, 97, 100, 103, 106, 108, 111, 114, 119, 127, 136, 146, 151, 157, 165, 174, 184, 188, 191, 194, 197, 199, 201, 207}\n\nfunc (i POSTag) String() string {\n\tif i >= POSTag(len(_POSTag_index)-1) {\n\t\treturn fmt.Sprintf(\"POSTag(%d)\", i)\n\t}\n\treturn _POSTag_name[_POSTag_index[i]:_POSTag_index[i+1]]\n}\n"
  },
  {
    "path": "POSTag_universal.go",
    "content": "// +build !stanfordtags\n\npackage lingo\n\n//go:generate stringer -type=POSTag -output=POSTag_universal_string.go\n\nconst BUILD_TAGSET = \"universaltags\"\n\nconst (\n\tX POSTag = iota // aka NULLTAG\n\tUNKNOWN_TAG\n\tROOT_TAG\n\tADJ\n\tADP\n\tADV\n\tAUX\n\tCONJ\n\tDET\n\tINTJ\n\tNOUN\n\tNUM\n\tPART\n\tPRON\n\tPROPN\n\tPUNCT\n\tSCONJ\n\tSYM\n\tVERB\n\n\tMAXTAG // MAXTAG is provided here as index support\n)\n\n// POSTagShortcut is a shortcut function to help the POSTagger shortcircuit some decisions about what the tag is\nfunc POSTagShortcut(l Lexeme) (POSTag, bool) {\n\tswitch l.LexemeType {\n\tcase Number:\n\t\treturn NUM, true\n\tcase Punctuation:\n\t\treturn PUNCT, true\n\tcase Symbol:\n\t\treturn SYM, true\n\tcase URI:\n\t\treturn X, true\n\tcase Date:\n\t\treturn NUM, true\n\tcase Time:\n\t\treturn NUM, true\n\tcase EOF:\n\t\treturn X, true\n\t}\n\treturn X, false\n}\n\nvar Adjectives = []POSTag{ADJ}\nvar Nouns = []POSTag{NOUN, PROPN}\nvar ProperNouns = []POSTag{PROPN}\nvar Verbs = []POSTag{VERB}\nvar Adverbs = []POSTag{ADV}\nvar Determiners = []POSTag{DET}\nvar Interrogatives = []POSTag{PRON, DET, ADV}\nvar Numbers = []POSTag{NUM}\nvar Symbols = []POSTag{SYM, PUNCT}\n\n// IsIN returns true if the POSTag is a subordinating conjunction.\n// The reason why this exists is because in the stanford tag, IN is the POSTag\n// while in the universal dependencies, it's the SCONJ POSTag\nfunc IsIN(x POSTag) bool { return x == SCONJ }\n"
  },
  {
    "path": "POSTag_universal_string.go",
    "content": "// +build !stanfordtags\n\n// Code generated by \"stringer -type=POSTag -output=POSTag_universal_string.go\"; DO NOT EDIT\n\npackage lingo\n\nimport \"fmt\"\n\nconst _POSTag_name = \"XUNKNOWN_TAGROOT_TAGADJADPADVAUXCONJDETINTJNOUNNUMPARTPRONPROPNPUNCTSCONJSYMVERBMAXTAG\"\n\nvar _POSTag_index = [...]uint8{0, 1, 12, 20, 23, 26, 29, 32, 36, 39, 43, 47, 50, 54, 58, 63, 68, 73, 76, 80, 86}\n\nfunc (i POSTag) String() string {\n\tif i >= POSTag(len(_POSTag_index)-1) {\n\t\treturn fmt.Sprintf(\"POSTag(%d)\", i)\n\t}\n\treturn _POSTag_name[_POSTag_index[i]:_POSTag_index[i+1]]\n}\n"
  },
  {
    "path": "README.md",
    "content": "# lingo #\n\n<img src=\"https://raw.githubusercontent.com/chewxy/lingo/master/media/gopher_small.png\" align=\"right\" />\n\n[![Build Status](https://travis-ci.org/chewxy/lingo.svg?branch=master)](https://travis-ci.org/chewxy/lingo)\n\npackage `lingo` provides the data structures and algorithms required for natural language processing.\n\nSpecifically, it provides a POS Tagger (`lingo/pos`), a Dependency Parser (`lingo/dep`), and a basic tokenizer (`lingo/lexer`) for English. It also provides data structures for holding corpuses (`lingo/corpus`), and treebanks (`lingo/treebank`).\n\nThe aim of this package is to provide a production quality pipeline for natural language processing.\n\n# Install #\n\nThe package is go-gettable: `go get -u github.com/chewxy/lingo`\n\nThis package and its subpackages depend on very few external packages. Here they are:\n\n| Package | Used For | Vitality | Notes | Licence |\n|---------|----------|----------|-------|---------|\n| [gorgonia](https://github.com/chewxy/gorgonia) | Machine learning | Vital. It won't be hard to rewrite them, but why? 
| Same author | [Gorgonia Licence](https://github.com/chewxy/gorgonia/blob/master/LICENSE) (Apache 2.0-like) |\n| [gographviz](https://github.com/awalterschulze/gographviz) | Visualization of annotations, and other graph-related visualizations | Vital for visualizations, which are a nice-to-have feature | API last changed 12th April 2017 | [gographviz licence](https://github.com/awalterschulze/gographviz/blob/master/LICENSE) (Apache 2.0) |\n| [errors](https://github.com/pkg/errors)  | Errors   | The package won't die without it, but it's a very nice to have | Stable API for the past year | [errors licence](https://github.com/pkg/errors/blob/master/LICENSE) (MIT/BSD like) |\n| [set](https://github.com/xtgo/set) | Set operations | Can be easily replaced | Stable API for the past year | [set licence](https://github.com/xtgo/set/blob/master/LICENSE) (MIT/BSD-like) |\n\n# Usage #\n\nSee the individual packages for usage. There is also a bunch of executables in the `cmd` directory. They're meant to be examples as to how a natural language processing pipeline can be set up.\n\nA natural language pipeline with this package is heavily channels driven. 
Here is an example for dependency parsing:\n\n```go\nfunc main() {\n\tinputString := `The cat sat on the mat`\n\tlx := lexer.New(\"dummy\", strings.NewReader(inputString)) // lexer - required to break a sentence up into words.\n\tpt := pos.New(pos.WithModel(posModel))                   // POS Tagger - required to tag the words with a part of speech tag.\n\tdp := dep.New(depModel)                                  // Creates a new parser\n\n\t// set up a pipeline\n\tpt.Input = lx.Output\n\tdp.Input = pt.Output\n\n\t// run all\n\tgo lx.Run()\n\tgo pt.Run()\n\tgo dp.Run()\n\n\t// wait to receive:\n\tfor {\n\t\tselect {\n\t\tcase d := <- dp.Output:\n\t\t\t// do something\n\t\tcase err:= <-dp.Error:\n\t\t\t// handle error\n\t\t}\n\t}\n\n}\n\n```\n\n\n\n# How It Works #\nFor specific tasks (POS tagging, parsing, named entity recognition etc), refer to the README of each subpackage. This package on its own mainly provides the data structures that the subpackages will use.\n\nPerhaps the most important data structure is the `*Annotation` structure. It basically holds a word and the associated metadata for the word.\n\nFor dependency parses, the graph takes three forms: `*Dependency`, `*DependencyTree` and `*Annotation`. All three forms are convertible from one to another. TODO: explain rationale behind each data type.\n\n## Quirks ##\n\n### Very Oddly Specific POS Tags and Dependency Rel Types ###\n\nA particular quirk you may have noticed is that the `POSTag` and `DependencyType` are hard coded in as constants. This package does in fact provide two variations of each: one from Stanford/Penn Treebank and one from [UniversalDependencies](http://universaldependencies.org/).\n\nThe main reason for hardcoding these is performance - knowing ahead of time how much to allocate reduces a lot of additional work the program has to do. 
It also reduces the chances of mutating a global variable.\n\nOf course this comes as a tradeoff - programs are limited to these two options. Thankfully there are only a limited number of POS Tag and Dependency Relation types. Two of the most popular ones (Stanford/PTB and Universal Dependencies) have been implemented.\n\nThe following build tags are supported:\n\n* stanfordtags\n* universaltags\n* stanfordrel\n* universalrel\n\nTo use a specific tagset or relset, build your program thusly: `go build -tags='stanfordtags'`.\n\nThe default tag and dependency rel types are the universal dependencies version.\n\n### Lexer ###\n\nYou should also note that the tokenizer, `lingo/lexer`, is not your usual run-of-the-mill NLP tokenizer. It's a tokenizer that tokenizes by space, with some specific rules for English. It was inspired by Rob Pike's talk on lexers. I thought it'd be cool to write something like that for NLP.\n\nThe test cases in package `lingo/lexer` showcase how it handles Unicode and other pathological English.\n\n# Contributing #\nSee CONTRIBUTING.md for more info.\n\n# Licence #\n\nThis package is licenced under the MIT licence.\n"
  },
  {
    "path": "annotation.go",
    "content": "package lingo\n\nimport (\n\t\"errors\"\n\t\"fmt\"\n\t\"strings\"\n)\n\n// Annotation is the word and its metadata.\n// This includes the position, its dependency head (if available), its lemma, POSTag, etc\n//\n// A collection of Annotations - AnnotatedSentence - is also a representation of a dependency parse\n//\n// Every field is exported for easy gobbing. Be very careful with setting stuff\ntype Annotation struct {\n\tLexeme\n\tPOSTag\n\t// NER\n\n\t// fields to do with an annotation being in a collection\n\tDependencyType\n\tID       int\n\tHead     *Annotation\n\tchildren AnnotationSet //will not be serialized\n\n\t// info about the annotation itself\n\tLemma   string\n\tLowered string\n\tStem    string\n\n\t// auxiliary data for processing\n\tCluster\n\tShape\n\tWordFlag\n}\n\nfunc NewAnnotation() *Annotation {\n\treturn &Annotation{\n\t\tLexeme: nullLexeme,\n\t\tLemma:  \"\",\n\t\tShape:  Shape(\"\"),\n\t}\n}\n\n// AnnotationFromLexTag is only ever used in tests. Fixer is optional\nfunc AnnotationFromLexTag(l Lexeme, t POSTag, f AnnotationFixer) *Annotation {\n\ta := &Annotation{\n\t\tLexeme:         l,\n\t\tPOSTag:         t,\n\t\tDependencyType: NoDepType,\n\t\tLemma:          \"\",\n\t\tLowered:        strings.ToLower(l.Value),\n\t}\n\n\t// it's ok to panic - it will cause the tests to fail\n\tif err := a.Process(f); err != nil {\n\t\tpanic(err)\n\t}\n\n\treturn a\n}\n\nfunc (a *Annotation) Clone() *Annotation {\n\tb := *a\n\tb.ID = -1\n\tb.Head = nil\n\tb.children = nil\n\tb.DependencyType = NoDepType\n\n\treturn &b\n}\n\nfunc (a *Annotation) SetHead(headAnn *Annotation) {\n\ta.Head = headAnn\n\tif headAnn != rootAnnotation && headAnn != startAnnotation && headAnn != nullAnnotation {\n\t\theadAnn.children = append(headAnn.children, a)\n\t}\n}\n\nfunc (a *Annotation) HeadID() int {\n\tif a.Head != nil {\n\t\treturn a.Head.ID\n\t}\n\treturn -1\n}\n\nfunc (a *Annotation) IsNumber() bool {\n\treturn IsNumber(a.POSTag) && (a.LexemeType != Date 
&& a.LexemeType != Time && a.LexemeType != URI)\n}\n\nfunc (a *Annotation) String() string {\n\treturn a.Value\n}\n\nfunc (a *Annotation) GoString() string {\n\ts := fmt.Sprintf(\"%q/%s\", a.Lexeme.Value, a.POSTag)\n\n\tif a.Head != nil {\n\t\treturn fmt.Sprintf(\"(%v) <-%v- (%q/%s) \", s, a.DependencyType, a.Head.Value, a.Head.POSTag)\n\t}\n\treturn s\n}\n\nfunc (a *Annotation) Process(f AnnotationFixer) error {\n\tif a.Lexeme != nullLexeme {\n\t\ta.Lowered = strings.ToLower(a.Value)\n\t\ta.Shape = a.Lexeme.Shape()\n\t\ta.WordFlag = a.Lexeme.Flags()\n\n\t\tvar err error\n\t\tif f != nil {\n\t\t\tvar stem string\n\t\t\tif stem, err = f.Stem(a.Lowered); err != nil {\n\t\t\t\tif _, ok := err.(componentUnavailable); !ok {\n\t\t\t\t\treturn err\n\t\t\t\t}\n\t\t\t}\n\t\t\ta.Stem = stem\n\n\t\t\tvar clust map[string]Cluster\n\t\t\tif clust, err = f.Clusters(); err == nil {\n\t\t\t\ta.Cluster = clust[a.Value]\n\t\t\t}\n\t\t}\n\n\t\treturn nil\n\t}\n\treturn errors.New(\"No Lexeme!\")\n}\n\nvar rootAnnotation = &Annotation{\n\tLexeme:         rootLexeme,\n\tPOSTag:         ROOT_TAG,\n\tDependencyType: Root,\n\tID:             0,\n\tHead:           nil,\n\tLemma:          \"\",\n\tLowered:        \"\",\n\tCluster:        0,\n\tShape:          \"\",\n\tWordFlag:       NoFlag,\n}\n\nvar startAnnotation = &Annotation{\n\tLexeme:         startLexeme,\n\tPOSTag:         ROOT_TAG,\n\tDependencyType: NoDepType,\n\tID:             -1,\n\tHead:           nil,\n\tLemma:          \"\",\n\tLowered:        \"\",\n\tCluster:        0,\n\tShape:          \"\",\n\tWordFlag:       NoFlag,\n}\n\nvar nullAnnotation = &Annotation{\n\tLexeme:         nullLexeme,\n\tPOSTag:         X,\n\tDependencyType: NoDepType,\n\tID:             -1,\n\tHead:           nil,\n\tLemma:          \"\",\n\tLowered:        \"\",\n\tCluster:        0,\n\tShape:          \"\",\n\tWordFlag:       NoFlag,\n}\n\nfunc RootAnnotation() *Annotation  { return rootAnnotation }\nfunc StartAnnotation() *Annotation { return 
startAnnotation }\nfunc NullAnnotation() *Annotation  { return nullAnnotation }\n\nfunc StringToAnnotation(s string, f AnnotationFixer) *Annotation {\n\tl := MakeLexeme(s, Word)\n\ta := NewAnnotation()\n\ta.Lexeme = l\n\tif err := a.Process(f); err != nil {\n\t\tpanic(err.Error())\n\t}\n\treturn a\n}\n\ntype AnnotationFixer interface {\n\tLemmatizer\n\tStemmer\n\tClusters() (map[string]Cluster, error)\n}\n"
  },
  {
    "path": "annotationSet.go",
    "content": "package lingo\n\nimport (\n\t\"sort\"\n\t\"unsafe\"\n\n\t\"github.com/xtgo/set\"\n)\n\ntype AnnotationSet []*Annotation\n\nfunc (as AnnotationSet) Len() int      { return len(as) }\nfunc (as AnnotationSet) Swap(i, j int) { as[i], as[j] = as[j], as[i] }\nfunc (as AnnotationSet) Less(i, j int) bool {\n\treturn uintptr(unsafe.Pointer(as[i])) < uintptr(unsafe.Pointer(as[j]))\n}\n\nfunc (as AnnotationSet) Set() AnnotationSet {\n\tsort.Sort(as)\n\tn := set.Uniq(as)\n\treturn as[:n]\n}\n\nfunc (as AnnotationSet) Contains(a *Annotation) bool {\n\tif as.Index(a) == len(as) {\n\t\treturn false\n\t}\n\treturn true\n}\n\nfunc (as AnnotationSet) Index(a *Annotation) int {\n\tfor i, an := range as {\n\t\tif an == a {\n\t\t\treturn i\n\t\t}\n\t}\n\treturn len(as)\n}\n\nfunc (as AnnotationSet) Add(a *Annotation) AnnotationSet {\n\tif as.Contains(a) {\n\t\treturn as\n\t}\n\tas = append(as, a)\n\treturn as\n}\n"
  },
  {
    "path": "annotationSet_bench_test.go",
    "content": "package lingo\n\nimport (\n\t\"sort\"\n\t\"testing\"\n)\n\nfunc (as AnnotationSet) index2(a *Annotation) int {\n\tsort.Sort(as)\n\tf := func(i int) bool { return as[i] == a }\n\treturn sort.Search(len(as), f)\n}\n\nvar benchIndexRes int\n\nfunc benchASIndex(size int, b *testing.B) {\n\tvar as AnnotationSet\n\tfor i := 0; i < size; i++ {\n\t\tas = append(as, new(Annotation))\n\t}\n\n\tdoesntcontain := new(Annotation)\n\tcontains := as[0]\n\n\tfor n := 0; n < b.N; n++ {\n\t\tbenchIndexRes = as.Index(doesntcontain)\n\t\tbenchIndexRes = as.Index(contains)\n\t}\n}\n\nfunc benchASIndex2(size int, b *testing.B) {\n\tvar as AnnotationSet\n\tfor i := 0; i < size; i++ {\n\t\tas = append(as, new(Annotation))\n\t}\n\n\tdoesntcontain := new(Annotation)\n\tcontains := as[0]\n\n\tfor n := 0; n < b.N; n++ {\n\t\tbenchIndexRes = as.index2(doesntcontain)\n\t\tbenchIndexRes = as.index2(contains)\n\t}\n}\n\nfunc BenchmarkAnnotationSetIndex_1(b *testing.B)    { benchASIndex(1, b) }\nfunc BenchmarkAnnotationSetIndex_2(b *testing.B)    { benchASIndex(2, b) }\nfunc BenchmarkAnnotationSetIndex_8(b *testing.B)    { benchASIndex(8, b) }\nfunc BenchmarkAnnotationSetIndex_16(b *testing.B)   { benchASIndex(16, b) }\nfunc BenchmarkAnnotationSetIndex_32(b *testing.B)   { benchASIndex(32, b) }\nfunc BenchmarkAnnotationSetIndex_64(b *testing.B)   { benchASIndex(64, b) }\nfunc BenchmarkAnnotationSetIndex_128(b *testing.B)  { benchASIndex(128, b) }\nfunc BenchmarkAnnotationSetIndex_256(b *testing.B)  { benchASIndex(256, b) }\nfunc BenchmarkAnnotationSetIndex_512(b *testing.B)  { benchASIndex(512, b) }\nfunc BenchmarkAnnotationSetIndex_1024(b *testing.B) { benchASIndex(1024, b) }\n\nfunc BenchmarkAnnotationSetIndex2_1(b *testing.B)    { benchASIndex2(1, b) }\nfunc BenchmarkAnnotationSetIndex2_2(b *testing.B)    { benchASIndex2(2, b) }\nfunc BenchmarkAnnotationSetIndex2_8(b *testing.B)    { benchASIndex2(8, b) }\nfunc BenchmarkAnnotationSetIndex2_16(b *testing.B)   { benchASIndex2(16, 
b) }\nfunc BenchmarkAnnotationSetIndex2_32(b *testing.B)   { benchASIndex2(32, b) }\nfunc BenchmarkAnnotationSetIndex2_64(b *testing.B)   { benchASIndex2(64, b) }\nfunc BenchmarkAnnotationSetIndex2_128(b *testing.B)  { benchASIndex2(128, b) }\nfunc BenchmarkAnnotationSetIndex2_256(b *testing.B)  { benchASIndex2(256, b) }\nfunc BenchmarkAnnotationSetIndex2_512(b *testing.B)  { benchASIndex2(512, b) }\nfunc BenchmarkAnnotationSetIndex2_1024(b *testing.B) { benchASIndex2(1024, b) }\n"
  },
  {
    "path": "browncluster.go",
    "content": "package lingo\n\nimport (\n\t\"bufio\"\n\t\"io\"\n\t\"strconv\"\n\t\"strings\"\n)\n\n// this file provides IO support and type safety for brown clusters.\n// The creation of brownclusters is not done here.\n// Right now lingo does not generate clusters - use PercyLiang's excellent tool for that\n\n// Cluster represents a brown cluster\ntype Cluster int\n\n// ReadCluster reads PercyLiang's cluster file format and returns a map of strings to Cluster\nfunc ReadCluster(r io.Reader) map[string]Cluster {\n\tscanner := bufio.NewScanner(r)\n\tclusters := make(map[string]Cluster)\n\n\tfor scanner.Scan() {\n\t\tline := scanner.Text()\n\n\t\tsplits := strings.Split(line, \"\\t\")\n\t\tvar word string\n\t\tvar cluster, freq int\n\n\t\tword = splits[1]\n\n\t\tvar i64 int64\n\t\tvar err error\n\t\tif i64, err = strconv.ParseInt(splits[0], 2, 64); err != nil {\n\t\t\tpanic(err)\n\t\t}\n\t\tcluster = int(i64)\n\n\t\tif freq, err = strconv.Atoi(splits[2]); err != nil {\n\t\t\tpanic(err)\n\t\t}\n\n\t\t// if clusterer has only seen a word a few times, then the cluster is not reliable\n\t\tif freq >= 3 {\n\t\t\tclusters[word] = Cluster(cluster)\n\t\t} else {\n\t\t\tclusters[word] = Cluster(0)\n\t\t}\n\t}\n\n\t// expand clusters with recasing\n\tfor word, clust := range clusters {\n\t\tlowered := strings.ToLower(word)\n\t\tif _, ok := clusters[lowered]; !ok {\n\t\t\tclusters[lowered] = clust\n\t\t}\n\n\t\ttitled := strings.ToTitle(word)\n\t\tif _, ok := clusters[titled]; !ok {\n\t\t\tclusters[titled] = clust\n\t\t}\n\n\t\tuppered := strings.ToUpper(word)\n\t\tif _, ok := clusters[uppered]; !ok {\n\t\t\tclusters[uppered] = clust\n\t\t}\n\t}\n\n\treturn clusters\n}\n"
  },
  {
    "path": "cmd/demo/io.go",
    "content": "package main\n\nimport (\n\t\"log\"\n\t\"os\"\n\n\t\"github.com/chewxy/lingo\"\n\t\"github.com/chewxy/lingo/dep\"\n\t\"github.com/chewxy/lingo/pos\"\n)\n\nconst (\n\tposModelFile = `model/pos_stanfordtags_universalrel.final.model`\n\tdepModelFile = `model/dep_stanfordtags_universalrel.final.model`\n\tbrownCluster = `clusters.txt`\n)\n\nfunc io() {\n\tvar err error\n\tlog.Println(\"loading POS Tagger model\")\n\tif posModel, err = pos.Load(posModelFile); err != nil {\n\t\tlog.Fatal(err)\n\t}\n\n\tlog.Println(\"loading Dependency Parser model\")\n\tif depModel, err = dep.Load(depModelFile); err != nil {\n\t\tlog.Fatal(err)\n\t}\n\tvar f *os.File\n\tif f, err = os.Open(brownCluster); err != nil {\n\t\tlog.Fatal(err)\n\t}\n\tclusters = lingo.ReadCluster(f)\n}\n"
  },
  {
    "path": "cmd/demo/main.go",
    "content": "package main\n\nimport (\n\t\"io/ioutil\"\n\t\"os\"\n\t\"os/exec\"\n\n\t\"github.com/abiosoft/ishell\"\n\t\"github.com/chewxy/lingo\"\n\t\"github.com/pkg/browser\"\n)\n\nfunc main() {\n\tio()\n\tshell := ishell.New()\n\n\tvar d *lingo.Dependency\n\t// var sent lingo.AnnotatedSentence\n\tvar err error\n\tshell.AddCmd(&ishell.Cmd{\n\t\tName: \"dep\",\n\t\tHelp: \"perform dependency parsing\",\n\t\tFunc: func(c *ishell.Context) {\n\t\t\tc.ShowPrompt(false)\n\t\t\tdefer c.ShowPrompt(true)\n\n\t\t\tc.Print(\"Query: \")\n\t\t\tquery := c.ReadLine()\n\n\t\t\tif d, err = pipeline(query); err != nil {\n\t\t\t\tc.Printf(\"Error: %v\", err)\n\t\t\t}\n\n\t\t\tc.Printf(\"%v\\n\", d)\n\t\t},\n\t})\n\n\tshell.AddCmd(&ishell.Cmd{\n\t\tName: \"show\",\n\t\tHelp: \"show dependency parse on browser\",\n\t\tFunc: func(c *ishell.Context) {\n\t\t\tvar tmp *os.File\n\t\t\tif tmp, err = ioutil.TempFile(\"\", \"dep\"); err != nil {\n\t\t\t\tc.Printf(\"Cannot open file %v\\n\", err)\n\t\t\t\treturn\n\t\t\t}\n\t\t\tdefer os.Remove(tmp.Name())\n\n\t\t\tc.Printf(\"%v\\n\", tmp.Name())\n\n\t\t\tdot := d.Tree().Dot()\n\t\t\ttmp.Write([]byte(dot))\n\t\t\tif err := tmp.Close(); err != nil {\n\t\t\t\tc.Printf(\"Error closing file %v\", err)\n\t\t\t}\n\t\t\tcmd := exec.Command(\"dot\", \"-Tpng\", \"-O\", tmp.Name())\n\t\t\tif err = cmd.Run(); err != nil {\n\t\t\t\tc.Printf(\"Cannot execute dot: %v\\n\", err)\n\t\t\t}\n\n\t\t\tbrowser.OpenFile(tmp.Name() + \".png\")\n\n\t\t},\n\t})\n\tshell.Start()\n}\n"
  },
  {
    "path": "cmd/demo/nlp.go",
    "content": "package main\n\nimport (\n\t\"fmt\"\n\t\"strings\"\n\n\t\"github.com/chewxy/lingo\"\n\t\"github.com/chewxy/lingo/dep\"\n\t\"github.com/chewxy/lingo/lexer\"\n\t\"github.com/chewxy/lingo/pos\"\n\t\"github.com/kljensen/snowball\"\n\t\"github.com/pkg/errors\"\n)\n\nvar posModel *pos.Model\nvar depModel *dep.Model\n\nvar clusters map[string]lingo.Cluster\n\ntype stemmer struct{}\n\nfunc (stemmer) Stem(a string) (string, error) {\n\treturn snowball.Stem(a, \"english\", true)\n}\n\ntype fixer struct {\n\tstemmer\n}\n\nfunc (f fixer) Clusters() (map[string]lingo.Cluster, error) { return clusters, nil }\nfunc (f fixer) Lemmatize(a string, pt lingo.POSTag) ([]string, error) {\n\treturn nil, nocomp(\"lemmatizer\")\n}\n\ntype nocomp string\n\nfunc (e nocomp) Error() string     { return fmt.Sprintf(\"no %v\", string(e)) }\nfunc (e nocomp) Component() string { return string(e) }\n\nfunc pipeline(s string) (d *lingo.Dependency, err error) {\n\tif posModel == nil || depModel == nil {\n\t\treturn nil, errors.Errorf(\"Unable to create a pipeline\")\n\t}\n\tlx := lexer.New(s, strings.NewReader(s))\n\tpt := pos.New(pos.WithModel(posModel), pos.WithStemmer(stemmer{}))\n\tdp := dep.New(depModel)\n\n\t// pipeline\n\tpt.Input = lx.Output\n\tdp.Input = pt.Output\n\n\tgo lx.Run()\n\tgo pt.Run()\n\tgo dp.Run()\n\n\tvar ok bool\n\tfor {\n\t\tselect {\n\t\tcase d, ok = <-dp.Output:\n\t\t\tif !ok {\n\t\t\t\tcontinue\n\t\t\t}\n\t\t\treturn\n\t\tcase err = <-dp.Error:\n\t\t\treturn\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "cmd/dep/fixer.go",
    "content": "package main\n\nimport (\n\t\"fmt\"\n\n\t\"github.com/chewxy/lingo\"\n\t\"github.com/kljensen/snowball\"\n)\n\ntype stemmer struct{}\n\nfunc (stemmer) Stem(a string) (string, error) {\n\treturn snowball.Stem(a, \"english\", true)\n}\n\ntype fixer struct {\n\tstemmer\n}\n\nfunc (f fixer) Clusters() (map[string]lingo.Cluster, error) { return clusters, nil }\nfunc (f fixer) Lemmatize(a string, pt lingo.POSTag) ([]string, error) {\n\treturn nil, nocomp(\"lemmatizer\")\n}\n\ntype nocomp string\n\nfunc (e nocomp) Error() string     { return fmt.Sprintf(\"no %v\", string(e)) }\nfunc (e nocomp) Component() string { return string(e) }\n"
  },
  {
    "path": "cmd/dep/io.go",
    "content": "package main\n\nimport (\n\t\"log\"\n\n\t\"github.com/chewxy/lingo/dep\"\n\t\"github.com/chewxy/lingo/pos\"\n\t\"github.com/chewxy/lingo/treebank\"\n)\n\nfunc validateFlags() {\n\tif *load == \"\" && *trainFile == \"\" {\n\t\tlog.Fatal(\"Must either load a model or pass in a training file\")\n\t}\n\n\tif *epoch < 0 {\n\t\tlog.Fatal(\"epochs must only be positive numbers\")\n\t}\n\n\tif *load != \"\" {\n\t\ttoLoad = true\n\t}\n\n\tif *trainFile != \"\" {\n\t\ttoTrain = true\n\t}\n\n\tif *testFile != \"\" {\n\t\t*cv = true\n\t}\n\n\t// warnings\n\tif *load == \"\" && *save == \"\" {\n\t\tlog.Println(\"WARNING: Models that have been trained will NOT be saved\")\n\t}\n}\n\nfunc loadTreebanks() {\n\tif *trainFile != \"\" {\n\t\ttrainTB = treebank.LoadUniversal(*trainFile)\n\t}\n\n\tif *testFile != \"\" {\n\t\ttestTB = treebank.LoadUniversal(*testFile)\n\t}\n}\n\nfunc loadPOSModel() {\n\tvar err error\n\tif *loadPOS == \"\" {\n\t\tlog.Fatal(\"Cannot proceed without having a POS model\")\n\t}\n\tif POSModel, err = pos.Load(*loadPOS); err != nil {\n\t\tlog.Fatal(err)\n\t}\n}\n\nfunc loadDepModel() {\n\tvar err error\n\n\tif DepModel, err = dep.Load(*load); err != nil {\n\t\tlog.Fatal(err)\n\t}\n}\n\nfunc saveModel() {\n\tif *save != \"\" && DepModel != nil {\n\t\tDepModel.Save(*save)\n\t}\n}\n"
  },
  {
    "path": "cmd/dep/main.go",
    "content": "package main\n\nimport (\n\t\"flag\"\n\t\"log\"\n\t\"os\"\n\t\"os/signal\"\n\t\"runtime/pprof\"\n\t\"syscall\"\n\n\t\"github.com/chewxy/lingo\"\n\t\"github.com/chewxy/lingo/dep\"\n\t\"github.com/chewxy/lingo/pos\"\n)\n\nvar save = flag.String(\"save\", \"\", \"save as...\")\nvar load = flag.String(\"load\", \"\", \"load a model\")\nvar loadPOS = flag.String(\"PTmodel\", \"\", \"load a POS Tagger model\")\nvar clusterFiles = flag.String(\"cluster\", \"\", \"Brown Cluster files. If nothing is passed in, then the brown cluster won't be used\")\nvar trainFile = flag.String(\"train\", \"\", \"Training on... (Only CONLLU formatted training files are accepted)\")\nvar testFile = flag.String(\"test\", \"\", \"Test on... (Only CONLLU formatted training files are accepted). If this is not provided, the model will be trained without crossvalidation\")\nvar cv = flag.Bool(\"cv\", false, \"Cross validate training model? Defaults to false.\")\nvar epoch = flag.Int(\"epoch\", 10, \"Training epochs. Defaults to 10\")\nvar format = flag.String(\"f\", \"\", \"Format to output. Default is none. 
Accepts: {json, dot}\")\n\nvar cpuprofile = flag.String(\"cpuprofile\", \"\", \"write cpu profile to file\")\nvar memprofile = flag.String(\"memprofile\", \"\", \"write memory profile to this file\")\n\nvar clusters map[string]lingo.Cluster\nvar POSModel *pos.Model\nvar DepModel *dep.Model\nvar toLoad, toTrain bool\n\nfunc init() {\n\tif lingo.BUILD_TAGSET != \"stanfordtags\" && lingo.BUILD_TAGSET != \"universaltags\" {\n\t\tlog.Fatalf(\"Tagset %q unsupported\", lingo.BUILD_TAGSET)\n\t}\n\n\tif lingo.BUILD_RELSET != \"stanfordrel\" && lingo.BUILD_RELSET != \"universalrel\" {\n\t\tlog.Fatalf(\"Relset %q unsupported\", lingo.BUILD_RELSET)\n\t}\n}\n\nfunc cleanup(sigChan chan os.Signal, cpuprofiling, memprofiling bool) {\n\tselect {\n\tcase <-sigChan:\n\t\tlog.Println(\"EMERGENCY EXIT\")\n\t\tif cpuprofiling {\n\t\t\tpprof.StopCPUProfile()\n\n\t\t}\n\t\tif memprofiling {\n\t\t\tf, err := os.Create(*memprofile)\n\t\t\tif err != nil {\n\t\t\t\tlog.Fatal(err)\n\t\t\t}\n\t\t\tpprof.WriteHeapProfile(f)\n\t\t\tf.Close()\n\t\t}\n\t\tsaveModel()\n\t\tos.Exit(1)\n\t}\n}\n\nfunc main() {\n\tflag.Parse()\n\tvalidateFlags()\n\n\tsigChan := make(chan os.Signal, 1)\n\tsignal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)\n\tvar cpuprofiling, memprofiling bool\n\tif *cpuprofile != \"\" {\n\t\tf, err := os.Create(*cpuprofile)\n\t\tif err != nil {\n\t\t\tlog.Fatal(err)\n\t\t}\n\t\tcpuprofiling = true\n\t\tpprof.StartCPUProfile(f)\n\t\tdefer pprof.StopCPUProfile()\n\t}\n\n\tif *memprofile != \"\" {\n\t\tmemprofiling = true\n\t}\n\n\tgo cleanup(sigChan, cpuprofiling, memprofiling)\n\n\tloadPOSModel()\n\tif toLoad {\n\t\tloadDepModel()\n\t}\n\n\tif toTrain {\n\t\tloadTreebanks()\n\t\ttrain()\n\t}\n\n\tsaveModel()\n}\n"
  },
  {
    "path": "cmd/dep/pipeline.go",
    "content": "package main\n\nimport (\n\t\"encoding/json\"\n\t\"fmt\"\n\t\"strings\"\n\n\t\"github.com/chewxy/lingo\"\n\t\"github.com/chewxy/lingo/dep\"\n\t\"github.com/chewxy/lingo/lexer\"\n\t\"github.com/chewxy/lingo/pos\"\n)\n\nfunc receive(deps chan *lingo.Dependency, errs, errChan chan error) {\n\tdefer close(errChan)\n\tfor {\n\t\tselect {\n\t\tcase dep, ok := <-deps:\n\t\t\tif !ok {\n\t\t\t\tcontinue\n\t\t\t}\n\t\t\tswitch *format {\n\t\t\tcase \"json\":\n\t\t\t\tbs, _ := json.MarshalIndent(dep, \"\", \"\\t\")\n\t\t\t\tfmt.Printf(\"%s\\n\", string(bs))\n\t\t\tcase \"dot\":\n\t\t\t\tfmt.Printf(\"%v\\n\", dep.Tree().Dot())\n\t\t\t}\n\n\t\tcase err := <-errs:\n\t\t\terrChan <- err\n\t\t}\n\t}\n}\n\nfunc pipeline(s string) error {\n\tlx := lexer.New(s, strings.NewReader(s))\n\tpt := pos.New(pos.WithModel(POSModel))\n\tdp := dep.New(DepModel)\n\n\tpt.Input = lx.Output\n\tdp.Input = pt.Output\n\n\terrChan := make(chan error)\n\tgo lx.Run()\n\tgo pt.Run()\n\tgo receive(dp.Output, dp.Error, errChan)\n\tdp.Run()\n\n\treturn <-errChan\n}\n"
  },
  {
    "path": "cmd/dep/train.go",
    "content": "package main\n\nimport (\n\t\"log\"\n\n\t\"github.com/chewxy/lingo/dep\"\n\t\"github.com/chewxy/lingo/treebank\"\n\t\"gorgonia.org/tensor\"\n)\n\nvar trainTB []treebank.SentenceTag\nvar testTB []treebank.SentenceTag\n\nfunc train() {\n\tconf := dep.DefaultNNConfig\n\tconf.Dtype = tensor.Float32\n\tvar trainer *dep.Trainer\n\n\tif testTB != nil {\n\t\tlog.Printf(\"TRAINING WITH CROSSVALIDATION\")\n\t\ttrainer = dep.NewTrainer(dep.WithGeneratedCorpus(trainTB...), dep.WithTrainingSet(trainTB), dep.WithCrossValidationSet(testTB), dep.WithConfig(conf))\n\t\ttrainer.SaveBest = \"TMP.model\"\n\t\tif err := trainer.Init(); err != nil {\n\t\t\tlog.Fatalf(\"Unable to initialize trainer: \\n%+v\", err)\n\t\t}\n\n\t\tprog := trainer.Perf()\n\t\tcost := trainer.Cost()\n\t\tgo func() {\n\t\t\tfor {\n\t\t\t\tselect {\n\t\t\t\tcase p := <-prog:\n\t\t\t\t\tlog.Printf(\"%v\\n\", p)\n\t\t\t\tcase c := <-cost:\n\t\t\t\t\tlog.Printf(\"Cost %v\\n\", c)\n\t\t\t\t}\n\t\t\t}\n\t\t}()\n\n\t} else {\n\t\ttrainer = dep.NewTrainer(dep.WithGeneratedCorpus(trainTB...), dep.WithTrainingSet(trainTB), dep.WithConfig(conf))\n\t\tif err := trainer.Init(); err != nil {\n\t\t\tlog.Fatalf(\"Unable to initialize trainer: \\n%+v\", err)\n\t\t}\n\n\t\tprog := trainer.Cost()\n\t\tgo func() {\n\t\t\tfor cost := range prog {\n\t\t\t\tlog.Printf(\"Cost %v\\n\", cost)\n\t\t\t}\n\t\t}()\n\t}\n\n\tif err := trainer.Train(*epoch); err != nil {\n\t\tlog.Fatal(err)\n\t}\n\n\tDepModel = trainer.Model\n}\n"
  },
  {
    "path": "cmd/lexer/main.go",
    "content": "package main\n\nimport (\n\t\"flag\"\n\t\"fmt\"\n\t\"strings\"\n\n\t\"github.com/chewxy/lingo\"\n\t\"github.com/chewxy/lingo/lexer\"\n)\n\nvar input = flag.String(\"input\", \"\", \"input string to lex\")\nvar output = make(chan lingo.Lexeme)\n\nfunc receieve() {\n\tfor l := range output {\n\t\tfmt.Printf(\"%v\\n\", l)\n\t}\n}\n\nfunc main() {\n\tflag.Parse()\n\n\ts := *input\n\n\tgo receieve()\n\tl := lexer.New(s, strings.NewReader(s))\n\tl.Output = output\n\tl.Run()\n}\n"
  },
  {
    "path": "cmd/pos/crossvalidation.go",
    "content": "package main\n\nimport (\n\t\"bytes\"\n\t\"fmt\"\n\t\"log\"\n\t\"os\"\n\t\"strings\"\n\t\"sync\"\n\n\t\"github.com/chewxy/lingo\"\n\t\"github.com/chewxy/lingo/lexer\"\n\t\"github.com/chewxy/lingo/pos\"\n\t\"github.com/chewxy/lingo/treebank\"\n)\n\ntype testResult struct {\n\ttagged lingo.AnnotatedSentence\n\tactual lingo.AnnotatedSentence\n}\n\nfunc (tr testResult) compare() (int, bool) {\n\ttagged := tr.tagged\n\tactual := tr.actual\n\n\tvar sameLength bool = true\n\n\tif len(tagged) != len(actual) {\n\t\tsameLength = false\n\t}\n\n\tvar counter int\n\tfor i, v := range actual {\n\t\tif i >= len(tagged) {\n\t\t\tbreak\n\t\t}\n\t\tif v.POSTag == tagged[i].POSTag {\n\t\t\tcounter++\n\t\t}\n\t}\n\treturn counter, sameLength\n}\n\nfunc crossValidate(resultChan chan testResult) {\n\tdiffLengthCount := 0\n\ttotalLength := 0\n\tcorrectCount := 0\n\tsentences := 0\n\n\tvar wrongResults []testResult\n\n\tfor res := range resultChan {\n\t\tsentences++\n\t\tlength := len(res.actual)\n\t\tcc, sl := res.compare()\n\t\tif !sl {\n\t\t\tdiffLengthCount++\n\t\t}\n\t\tcorrectCount += cc\n\t\ttotalLength += length\n\n\t\tif cc != length && *inspect != \"\" {\n\t\t\twrongResults = append(wrongResults, res)\n\t\t}\n\t}\n\n\tif *inspect != \"\" {\n\t\tf, err := os.OpenFile(*inspect, os.O_WRONLY|os.O_CREATE, 0666)\n\t\tif err != nil {\n\t\t\tlog.Fatal(err)\n\t\t}\n\n\t\t// can write directly to f\n\t\tvar buf bytes.Buffer\n\t\tfor _, res := range wrongResults {\n\t\t\tfmt.Fprintf(&buf, \"Sentence: \\nW:%v\\nG:%v\\nTags:\\nW: %v\\nG: %v\\n\\n\", res.actual.StringSlice(), res.tagged.StringSlice(), res.actual.Tags(), res.tagged.Tags())\n\t\t}\n\n\t\tf.WriteString(buf.String())\n\t\tf.Close()\n\t}\n\n\tfmt.Printf(\"CrossValidation: %d/%d = %f. 
Differing Lengths : %d/%d = %f\\n\", correctCount, totalLength, float64(correctCount)/float64(totalLength), diffLengthCount, sentences, float64(diffLengthCount)/float64(sentences))\n}\n\nfunc collect(ch chan lingo.AnnotatedSentence, correct lingo.AnnotatedSentence, outCh chan testResult, wg *sync.WaitGroup) {\n\tdefer wg.Done()\n\n\tfor sentence := range ch {\n\t\toutCh <- testResult{sentence, correct}\n\t}\n}\n\nfunc testModel(sentences []treebank.SentenceTag) {\n\tresultChan := make(chan testResult)\n\n\tgo func() {\n\t\tdefer close(resultChan)\n\t\tvar wg sync.WaitGroup\n\t\tfor _, sentence := range sentences {\n\t\t\twg.Add(1)\n\t\t\tinput := sentence.String()\n\t\t\tcorrect := sentence.AnnotatedSentence(fixer{stemmer{}})\n\t\t\tch := make(chan lingo.AnnotatedSentence)\n\t\t\tgo collect(ch, correct, resultChan, &wg)\n\t\t\tgo cvpipeline(input, ch)\n\t\t}\n\t\twg.Wait()\n\t}()\n\n\tcrossValidate(resultChan)\n\n}\n\nfunc cvpipeline(s string, output chan lingo.AnnotatedSentence) {\n\tl := lexer.New(s, strings.NewReader(s))\n\tpt := pos.New(pos.WithModel(model))\n\n\tpt.Input = l.Output\n\tpt.Output = output\n\n\tgo l.Run()\n\tpt.Run()\n}\n"
  },
  {
    "path": "cmd/pos/fixer.go",
    "content": "// +build !chewxy\n\npackage main\n\nimport (\n\t\"fmt\"\n\n\t\"github.com/chewxy/lingo\"\n\t\"github.com/kljensen/snowball\"\n)\n\ntype stemmer struct{}\n\nfunc (stemmer) Stem(a string) (string, error) {\n\treturn snowball.Stem(a, \"english\", true)\n}\n\ntype fixer struct {\n\tstemmer\n}\n\nfunc (f fixer) Clusters() (map[string]lingo.Cluster, error) { return clusters, nil }\nfunc (f fixer) Lemmatize(a string, pt lingo.POSTag) ([]string, error) {\n\treturn nil, nocomp(\"lemmatizer\")\n}\n\ntype nocomp string\n\nfunc (e nocomp) Error() string     { return fmt.Sprintf(\"no %v\", string(e)) }\nfunc (e nocomp) Component() string { return string(e) }\n"
  },
  {
    "path": "cmd/pos/main.go",
    "content": "package main\n\nimport (\n\t\"flag\"\n\t\"fmt\"\n\t\"log\"\n\t\"os\"\n\t\"os/signal\"\n\t\"runtime/pprof\"\n\t\"strings\"\n\t\"sync\"\n\t\"syscall\"\n\t\"time\"\n\n\t\"github.com/chewxy/lingo\"\n\t\"github.com/chewxy/lingo/lexer\"\n\t\"github.com/chewxy/lingo/pos\"\n\t\"github.com/chewxy/lingo/treebank\"\n)\n\nvar save = flag.String(\"save\", \"\", \"save as...\")\nvar load = flag.String(\"load\", \"\", \"load a model\")\nvar clusterFiles = flag.String(\"cluster\", \"\", \"Brown Cluster files. If nothing is passed in, then the brown cluster won't be used\")\nvar trainFile = flag.String(\"train\", \"\", \"Training on... files that end with '.conllu' will be treated as CONLLU formatted files. Files ending with '.zip' will be treted as EWT files\")\nvar testFile = flag.String(\"test\", \"\", \"Test on... Files to cross validate the model on. If this is provided, automatic crossvalidation will be done\")\nvar cv = flag.Bool(\"cv\", false, \"Cross validate training model? Defaults to false.\")\nvar epoch = flag.Int(\"epoch\", 1500, \"Training epochs. Defaults to 1500\")\nvar inspect = flag.String(\"inpect\", \"\", \"Inspect all the wrong outputs to figure out what went wrong in the POSTagging. 
This is useful for debugging\")\nvar input = flag.String(\"input\", \"\", \"Input sentence to tag\")\n\nvar cpuprofile = flag.String(\"cpuprofile\", \"\", \"write cpu profile to file\")\nvar memprofile = flag.String(\"memprofile\", \"\", \"write memory profile to this file\")\n\nvar clusters map[string]lingo.Cluster\nvar model *pos.Model\n\nfunc receive(sentences chan lingo.AnnotatedSentence, wg *sync.WaitGroup) {\n\tdefer wg.Done()\n\tfor sent := range sentences {\n\t\tfor _, a := range sent {\n\t\t\tfmt.Printf(\"%#v: %s| %s | %s | %d\\n\", a, a.POSTag, a.Lemma, a.WordFlag, a.Cluster)\n\t\t}\n\t}\n}\n\nfunc pipeline(s string) {\n\tl := lexer.New(s, strings.NewReader(s))\n\tpt := pos.New(pos.WithModel(model))\n\n\tpt.Input = l.Output\n\tvar wg sync.WaitGroup\n\n\tgo l.Run()\n\tgo receive(pt.Output, &wg)\n\n\twg.Add(1)\n\n\tpt.Run()\n\twg.Wait()\n}\n\nfunc validateFlags() {\n\tif *load == \"\" && *trainFile == \"\" {\n\t\tlog.Fatal(\"Must either load a model or pass in a training file\")\n\t}\n\n\tif *epoch < 0 {\n\t\tlog.Fatal(\"epochs must be positive numbers only!\")\n\t}\n\n\tif *testFile != \"\" {\n\t\t*cv = true\n\t}\n\n\t// warnings\n\n\tif *load == \"\" && *save == \"\" {\n\t\tlog.Println(\"WARNING: Models that are trained will NOT be saved\")\n\t}\n}\n\nfunc loadOrTrain() {\n\tvar trained *pos.Tagger\n\tif *clusterFiles != \"\" {\n\t\tf, err := os.Open(*clusterFiles)\n\t\tif err != nil {\n\t\t\tlog.Fatal(err)\n\t\t}\n\t\tclusters = lingo.ReadCluster(f)\n\n\t\ttrained = pos.New(pos.WithCluster(clusters), pos.WithStemmer(stemmer{}))\n\t} else {\n\t\ttrained = pos.New()\n\t}\n\n\tif *load != \"\" {\n\t\tstart := time.Now()\n\t\tvar err error\n\t\tif model, err = pos.Load(*load); err != nil {\n\t\t\tlog.Fatal(err)\n\t\t}\n\t\tlog.Printf(\"Loading model from %q took %v\", *load, time.Since(start))\n\t\treturn\n\t}\n\n\tvar sentences []treebank.SentenceTag\n\tswitch {\n\tcase strings.HasSuffix(*trainFile, \".zip\"):\n\t\tsentences = 
treebank.LoadEWT(*trainFile)\n\n\t\t// TODO split sentences for crossvalidation\n\n\tcase strings.HasSuffix(*trainFile, \".conllu\"):\n\t\tsentences = treebank.LoadUniversal(*trainFile)\n\tdefault:\n\t\tf, err := os.Open(*trainFile)\n\t\tif err != nil {\n\t\t\tlog.Fatal(err)\n\t\t}\n\n\t\tsentences = treebank.ReadConllu(f)\n\t}\n\n\tlog.Printf(\"Start training for %d epochs...\", *epoch)\n\tstart := time.Now()\n\ttrained.Train(sentences, *epoch)\n\tlog.Printf(\"End Training. Training took %v minutes\", time.Since(start).Minutes())\n\n\tif *save != \"\" {\n\t\ttrained.Save(*save)\n\t\tlog.Printf(\"Model saved as: %v\", *save)\n\t}\n}\n\nfunc cleanup(sigChan chan os.Signal, profiling bool) {\n\tselect {\n\tcase <-sigChan:\n\t\tlog.Println(\"EMERGENCY EXIT\")\n\t\tif profiling {\n\t\t\tpprof.StopCPUProfile()\n\t\t}\n\t\tos.Exit(1)\n\t}\n}\n\nfunc main() {\n\tflag.Parse()\n\n\tif lingo.BUILD_TAGSET != \"stanfordtags\" && lingo.BUILD_TAGSET != \"universaltags\" {\n\t\tlog.Fatalf(\"Tagset: %v is unsupported\", lingo.BUILD_TAGSET)\n\t}\n\n\tsigChan := make(chan os.Signal, 1)\n\tsignal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)\n\n\tvar profiling bool\n\tif *cpuprofile != \"\" {\n\t\tf, err := os.Create(*cpuprofile)\n\t\tif err != nil {\n\t\t\tlog.Fatal(err)\n\t\t}\n\t\tprofiling = true\n\t\tpprof.StartCPUProfile(f)\n\t\tdefer pprof.StopCPUProfile()\n\t}\n\n\tgo cleanup(sigChan, profiling)\n\n\tvalidateFlags()\n\tloadOrTrain()\n\n\tif *memprofile != \"\" {\n\t\tf, err := os.Create(*memprofile)\n\t\tif err != nil {\n\t\t\tlog.Fatal(err)\n\t\t}\n\t\tpprof.WriteHeapProfile(f)\n\t\tf.Close()\n\t}\n\n\tif *input != \"\" {\n\t\tpipeline(*input)\n\t}\n\n\tif *cv {\n\t\tlog.Printf(\"Cross Validating now\")\n\t\ttestSentences := treebank.LoadUniversal(*testFile)\n\t\ttestModel(testSentences)\n\t}\n\n}\n"
  },
  {
    "path": "const.go",
    "content": "package lingo\n\n// constants that are not pertaining to build tags\n\nvar empty struct{}\n\n// NumberWords was generated with this python code\n/*\n\tnumberWords = {}\n\n\tsimple = '''zero one two three four five six seven eight nine ten eleven twelve\n\t        thirteen fourteen fifteen sixteen seventeen eighteen nineteen\n\t        twenty'''.split()\n\tfor i, word in zip(xrange(0, 20+1), simple):\n\t    numberWords[word] = i\n\n\ttense = '''thirty forty fifty sixty seventy eighty ninety hundred'''.split()\n\tfor i, word in zip(xrange(30, 100+1, 10), tense):\n\t\tnumberWords[word] = i\n\n\tlarges = '''thousand million billion trillion quadrillion quintillion sextillion septillion'''.split()\n\tfor i, word in zip(xrange(3, 24+1, 3), larges):\n\t\tnumberWords[word] = 10**i\n*/\nvar NumberWords = map[string]int{\n\t\"zero\":        0,\n\t\"one\":         1,\n\t\"two\":         2,\n\t\"three\":       3,\n\t\"four\":        4,\n\t\"five\":        5,\n\t\"six\":         6,\n\t\"seven\":       7,\n\t\"eight\":       8,\n\t\"nine\":        9,\n\t\"ten\":         10,\n\t\"eleven\":      11,\n\t\"twelve\":      12,\n\t\"thirteen\":    13,\n\t\"fourteen\":    14,\n\t\"fifteen\":     15,\n\t\"sixteen\":     16,\n\t\"nineteen\":    19,\n\t\"seventeen\":   17,\n\t\"eighteen\":    18,\n\t\"twenty\":      20,\n\t\"thirty\":      30,\n\t\"forty\":       40,\n\t\"fifty\":       50,\n\t\"sixty\":       60,\n\t\"seventy\":     70,\n\t\"eighty\":      80,\n\t\"ninety\":      90,\n\t\"hundred\":     100,\n\t\"thousand\":    1000,\n\t\"million\":     1000000,\n\t\"billion\":     1000000000,\n\t\"trillion\":    1000000000000,\n\t\"quadrillion\": 1000000000000000,\n\t// \"quintillion\": 1000000000000000000,\n\t// \"sextillion\": 1000000000000000000000,\n\t// \"septillion\": 1000000000000000000000000,\n}\n"
  },
  {
    "path": "corpus/consopt.go",
    "content": "package corpus\n\nimport (\n\t\"log\"\n\t\"sort\"\n\t\"sync/atomic\"\n\t\"unicode/utf8\"\n\n\t\"github.com/pkg/errors\"\n\t\"github.com/xtgo/set\"\n)\n\n// ConsOpt is a construction option for manual creation of a Corpus\ntype ConsOpt func(c *Corpus) error\n\n// WithWords creates a corpus from a word list. It may have repeated words\nfunc WithWords(a []string) ConsOpt {\n\tf := func(c *Corpus) error {\n\t\ts := set.Strings(a)\n\t\tc.words = s\n\t\tc.frequencies = make([]int, len(s))\n\n\t\tids := make(map[string]int)\n\t\tmaxID := len(s)\n\n\t\tvar totalFreq, maxWL int\n\t\t// NOTE: here we're iterating over the set of words\n\t\tfor i, w := range s {\n\t\t\truneCount := utf8.RuneCountInString(w)\n\t\t\tif runeCount > c.maxWordLength {\n\t\t\t\tmaxWL = runeCount\n\t\t\t}\n\n\t\t\tids[w] = i\n\t\t}\n\n\t\t// NOTE: here we're iterating over the original word list.\n\t\tfor _, w := range a {\n\t\t\tc.frequencies[ids[w]]++\n\t\t\ttotalFreq++\n\t\t}\n\n\t\tc.ids = ids\n\t\tatomic.AddInt64(&c.maxid, int64(maxID))\n\t\tc.totalFreq = totalFreq\n\t\tc.maxWordLength = maxWL\n\t\treturn nil\n\t}\n\treturn f\n}\n\n// WithOrderedWords creates a Corpus with the given word order\nfunc WithOrderedWords(a []string) ConsOpt {\n\tf := func(c *Corpus) error {\n\t\ts := a\n\t\tc.words = s\n\t\tc.frequencies = make([]int, len(s))\n\t\tfor i := range c.frequencies {\n\t\t\tc.frequencies[i] = 1\n\t\t}\n\n\t\tids := make(map[string]int)\n\t\tmaxID := len(s)\n\t\ttotalFreq := len(s)\n\t\tvar maxWL int\n\t\tfor i, w := range a {\n\t\t\truneCount := utf8.RuneCountInString(w)\n\t\t\tif runeCount > c.maxWordLength {\n\t\t\t\tmaxWL = runeCount\n\t\t\t}\n\t\t\tids[w] = i\n\t\t}\n\n\t\tc.ids = ids\n\t\tatomic.AddInt64(&c.maxid, int64(maxID))\n\t\tc.totalFreq = totalFreq\n\t\tc.maxWordLength = maxWL\n\t\treturn nil\n\t}\n\treturn f\n}\n\n// WithSize preallocates all the things in Corpus\nfunc WithSize(size int) ConsOpt {\n\treturn func(c *Corpus) error {\n\t\tc.words = 
make([]string, 0, size)\n\t\tc.frequencies = make([]int, 0, size)\n\t\treturn nil\n\t}\n}\n\n// FromDict is a construction option to take a map[string]int where the int represents the word ID.\n// This is useful for constructing corpuses from foreign sources where the ID mappings are important\nfunc FromDict(d map[string]int) ConsOpt {\n\treturn func(c *Corpus) error {\n\t\tvar a sortutil\n\t\tfor k, v := range d {\n\t\t\ta.words = append(a.words, k)\n\t\t\ta.ids = append(a.ids, v)\n\t\t}\n\t\tsort.Sort(&a)\n\t\tc.ids = make(map[string]int)\n\t\tfor i, w := range a.words {\n\t\t\tif i != a.ids[i] {\n\t\t\t\treturn errors.Errorf(\"Unmarshaling error. Expected %dth ID to be %d. Got %d instead. Perhaps something went wrong during sorting? SLYTHERIN IT IS!\", i, i, a.ids[i])\n\t\t\t}\n\t\t\tc.words = append(c.words, w)\n\t\t\tc.frequencies = append(c.frequencies, 1)\n\t\t\tc.ids[w] = i\n\n\t\t\tc.totalFreq++\n\t\t\truneCount := utf8.RuneCountInString(w)\n\t\t\tif runeCount > c.maxWordLength {\n\t\t\t\tlog.Printf(\"FD MaxWordLength %d - %q\", runeCount, w)\n\t\t\t\tc.maxWordLength = runeCount\n\t\t\t}\n\t\t}\n\t\tc.maxid = int64(len(a.words))\n\t\treturn nil\n\t}\n\n}\n\n// FromDictWithFreq is like FromDict, but also has a frequency.\nfunc FromDictWithFreq(d map[string]struct{ ID, Freq int }) ConsOpt {\n\treturn func(c *Corpus) error {\n\t\tvar a sortutil\n\t\tfor k, v := range d {\n\t\t\ta.words = append(a.words, k)\n\t\t\ta.ids = append(a.ids, v.ID)\n\t\t\ta.freqs = append(a.freqs, v.Freq)\n\t\t}\n\t\tsort.Sort(&a)\n\t\tc.ids = make(map[string]int)\n\t\tfor i, w := range a.words {\n\t\t\tif i != a.ids[i] {\n\t\t\t\treturn errors.Errorf(\"Unmarshaling error. Expected %dth ID to be %d. Got %d instead. Perhaps something went wrong during sorting? 
SLYTHERIN IT IS!\", i, i, a.ids[i])\n\t\t\t}\n\t\t\tc.words = append(c.words, w)\n\t\t\tc.frequencies = append(c.frequencies, a.freqs[i])\n\t\t\tc.ids[w] = i\n\n\t\t\tc.totalFreq += a.freqs[i]\n\t\t\truneCount := utf8.RuneCountInString(w)\n\t\t\tif runeCount > c.maxWordLength {\n\t\t\t\tc.maxWordLength = runeCount\n\t\t\t}\n\t\t}\n\t\tc.maxid = int64(len(a.words))\n\t\treturn nil\n\t}\n}\n"
  },
  {
    "path": "corpus/corpus.go",
    "content": "package corpus\n\nimport (\n\t\"sync/atomic\"\n\t\"unicode/utf8\"\n\n\t\"github.com/pkg/errors\"\n)\n\n// Corpus is a data structure holding the relevant metadata and information for a corpus of text.\n// It serves as vocabulary with ID for lookup. This is very useful as neural networks rely on the IDs rather than the text themselves\ntype Corpus struct {\n\twords       []string\n\tfrequencies []int\n\n\tids map[string]int\n\n\t// atomic read and write plz\n\tmaxid         int64\n\ttotalFreq     int\n\tmaxWordLength int\n}\n\n// New creates a new *Corpus\nfunc New() *Corpus {\n\tc := &Corpus{\n\t\twords:       make([]string, 0),\n\t\tfrequencies: make([]int, 0),\n\t\tids:         make(map[string]int),\n\t}\n\n\t// add some default words\n\tc.Add(\"\") // aka NULL - when there are no words\n\tc.Add(\"-UNKNOWN-\")\n\tc.Add(\"-ROOT-\")\n\tc.maxWordLength = 0 // specials don't have lengths\n\n\treturn c\n}\n\n// Construct creates a Corpus given the construction options. This allows for more flexibility\nfunc Construct(opts ...ConsOpt) (*Corpus, error) {\n\tc := new(Corpus)\n\n\t// checks\n\tif c.words == nil {\n\t\tc.words = make([]string, 0)\n\t}\n\tif c.frequencies == nil {\n\t\tc.frequencies = make([]int, 0)\n\t}\n\tif c.ids == nil {\n\t\tc.ids = make(map[string]int)\n\t}\n\n\tfor _, opt := range opts {\n\t\tif err := opt(c); err != nil {\n\t\t\treturn nil, err\n\t\t}\n\t}\n\n\treturn c, nil\n}\n\n// ID returns the ID of a word and whether or not it was found in the corpus\nfunc (c *Corpus) Id(word string) (int, bool) {\n\tid, ok := c.ids[word]\n\treturn id, ok\n}\n\n// Word returns the word given the ID, and whether or not it was found in the corpus\nfunc (c *Corpus) Word(id int) (string, bool) {\n\tsize := atomic.LoadInt64(&c.maxid)\n\tmaxid := int(size)\n\n\tif id >= maxid {\n\t\treturn \"\", false\n\t}\n\treturn c.words[id], true\n}\n\n// Add adds a word to the corpus and returns its ID. 
If a word was previously in the corpus, it merely updates the frequency count and returns the ID\nfunc (c *Corpus) Add(word string) int {\n\tif id, ok := c.ids[word]; ok {\n\t\tc.frequencies[id]++\n\t\tc.totalFreq++\n\t\treturn id\n\t}\n\n\tid := atomic.AddInt64(&c.maxid, 1)\n\tc.ids[word] = int(id - 1)\n\tc.words = append(c.words, word)\n\tc.frequencies = append(c.frequencies, 1)\n\tc.totalFreq++\n\n\truneCount := utf8.RuneCountInString(word)\n\tif runeCount > c.maxWordLength {\n\t\tc.maxWordLength = runeCount\n\t}\n\n\treturn int(id - 1)\n}\n\n// Size returns the size of the corpus.\nfunc (c *Corpus) Size() int {\n\tsize := atomic.LoadInt64(&c.maxid)\n\treturn int(size)\n}\n\n// WordFreq returns the frequency of the word. If the word wasn't in the corpus, it returns 0.\nfunc (c *Corpus) WordFreq(word string) int {\n\tid, ok := c.ids[word]\n\tif !ok {\n\t\treturn 0\n\t}\n\n\treturn c.frequencies[id]\n}\n\n// IDFreq returns the frequency of a word given an ID. If the word isn't in the corpus it returns 0.\nfunc (c *Corpus) IDFreq(id int) int {\n\tsize := atomic.LoadInt64(&c.maxid)\n\tmaxid := int(size)\n\n\tif id >= maxid {\n\t\treturn 0\n\t}\n\treturn c.frequencies[id]\n}\n\n// TotalFreq returns the total number of words ever seen by the corpus. This number includes the count of repeat words.\nfunc (c *Corpus) TotalFreq() int {\n\treturn c.totalFreq\n}\n\n// MaxWordLength returns the length of the longest known word in the corpus.\nfunc (c *Corpus) MaxWordLength() int {\n\treturn c.maxWordLength\n}\n\n// WordProb returns the probability of a word appearing in the corpus.\nfunc (c *Corpus) WordProb(word string) (float64, bool) {\n\tid, ok := c.Id(word)\n\tif !ok {\n\t\treturn 0, false\n\t}\n\n\tcount := c.frequencies[id]\n\treturn float64(count) / float64(c.totalFreq), true\n\n}\n\n// Merge combines two corpuses. 
The receiver is the one that is mutated.\nfunc (c *Corpus) Merge(other *Corpus) {\n\tfor i, word := range other.words {\n\t\tfreq := other.frequencies[i]\n\t\tif id, ok := c.ids[word]; ok {\n\t\t\tc.frequencies[id] += freq\n\t\t\tc.totalFreq += freq\n\t\t} else {\n\t\t\tid := c.Add(word)\n\t\t\tc.frequencies[id] += freq - 1\n\t\t\tc.totalFreq += freq - 1\n\t\t}\n\t}\n}\n\n// Replace replaces the content of a word. The old reference remains.\n//\n// e.g: c.Replace(\"foo\", \"bar\")\n// c.Id(\"foo\") will still return a ID. The ID will be the same as c.Id(\"bar\")\nfunc (c *Corpus) Replace(a, with string) error {\n\told, ok := c.ids[a]\n\tif !ok {\n\t\treturn errors.Errorf(\"Cannot replace %q with %q. %q is not found\", a, with, a)\n\t}\n\tif _, ok := c.ids[with]; ok {\n\t\treturn errors.Errorf(\"Cannot replace %q with %q. %q exists in the corpus\", a, with, with)\n\t}\n\tc.words[old] = with\n\treturn nil\n\n}\n\n// ReplaceWord replaces the word associated with the given ID. The old reference remains.\nfunc (c *Corpus) ReplaceWord(id int, with string) error {\n\tif id >= len(c.words) {\n\t\treturn errors.Errorf(\"Cannot replace word with ID %d. Out of bounds.\", id)\n\t}\n\tif _, ok := c.ids[with]; ok {\n\t\treturn errors.Errorf(\"Cannot replace word with ID %d with %q. %q exists in the corpus\", id, with, with)\n\t}\n\tc.words[id] = with\n\treturn nil\n}\n"
  },
  {
    "path": "corpus/corpus_test.go",
    "content": "package corpus\n\nimport (\n\t\"testing\"\n\n\t\"github.com/stretchr/testify/assert\"\n)\n\nfunc TestCorpus(t *testing.T) {\n\tassert := assert.New(t)\n\tdict := New()\n\tassert.Equal(0, dict.WordFreq(\"hello\")) // frequency of a word not in dict ould have to be 0\n\tassert.Equal(0, dict.IDFreq(3))         // ditto\n\n\tid := dict.Add(\"hello\")\n\n\tassert.Equal(3, id)\n\tassert.Equal([]string{\"\", \"-UNKNOWN-\", \"-ROOT-\", \"hello\"}, dict.words)\n\tassert.Equal(map[string]int{\"\": 0, \"-UNKNOWN-\": 1, \"-ROOT-\": 2, \"hello\": 3}, dict.ids)\n\tassert.Equal(4, dict.Size())\n\n\tid2, ok := dict.Id(\"hello\")\n\tif !ok {\n\t\tt.Errorf(\"The ID of null should be  0\")\n\t}\n\tassert.Equal(id, id2)\n\n\tword, ok := dict.Word(3)\n\tif !ok {\n\t\tt.Errorf(\"Expected word of ID 3 to be found\")\n\t}\n\tassert.Equal(\"hello\", word)\n\n\tdict.Add(word)\n\tassert.Equal(2, dict.WordFreq(word))\n\tassert.Equal(2, dict.IDFreq(3))\n\tassert.Equal(5, dict.TotalFreq())\n\tassert.Equal(5, dict.MaxWordLength())\n\n\tprob, ok := dict.WordProb(word)\n\tif !ok {\n\t\tt.Errorf(\"Expected a probability\")\n\t}\n\tassert.Equal(0.4, prob)\n\t// t.Logf(\"%q: %v\", word, dict.WordProb(word))\n}\n\nfunc TestCorpus_Merge(t *testing.T) {\n\tassert := assert.New(t)\n\n\tdict := New()\n\tid := dict.Add(\"hello\")\n\tdict.frequencies[id] += 4 // freq for \"hello\" is 5\n\tdict.totalFreq += 4\n\n\tother := New()\n\tid = other.Add(\"hello\")\n\tother.frequencies[id] += 2 // freq for \"hello\" is 3\n\tother.totalFreq += 2\n\tid = other.Add(\"world\")\n\tother.frequencies[id] += 1\n\tother.totalFreq += 1\n\n\tdict.Merge(other)\n\n\tassert.Equal(8, dict.WordFreq(\"hello\"))\n\tassert.Equal(2, dict.WordFreq(\"world\"))\n}\n"
  },
  {
    "path": "corpus/functions.go",
    "content": "package corpus\n\nimport (\n\t\"math\"\n\t\"strings\"\n\t\"unicode/utf8\"\n\n\t\"github.com/chewxy/lingo\"\n\t\"github.com/chewxy/lingo/treebank\"\n\t\"github.com/pkg/errors\"\n)\n\n// GenerateCorpus creates a Corpus given a set of SentenceTag from a training set.\nfunc GenerateCorpus(sentenceTags []treebank.SentenceTag) *Corpus {\n\twords := make([]string, 3)\n\tfrequencies := make([]int, 3)\n\n\twords[0] = \"\"      // aka NULL, for when no word can be found\n\tfrequencies[0] = 0 // no word is never found\n\n\twords[1] = \"-UNKNOWN-\"\n\tfrequencies[1] = 0\n\n\twords[2] = \"-ROOT-\"\n\tfrequencies[2] = 1\n\n\tknownWords := make(map[string]int)\n\tknownWords[\"\"] = 0\n\tknownWords[\"-UNKNOWN-\"] = 1\n\tknownWords[\"-ROOT-\"] = 2\n\n\tmaxWordLength := 0\n\n\tfor _, sentenceTag := range sentenceTags {\n\t\tfor _, lex := range sentenceTag.Sentence {\n\t\t\tid, ok := knownWords[lex.Value]\n\t\t\tif !ok {\n\t\t\t\tknownWords[lex.Value] = len(words)\n\t\t\t\twords = append(words, lex.Value)\n\t\t\t\tfrequencies = append(frequencies, 1)\n\n\t\t\t\truneCount := utf8.RuneCountInString(lex.Value)\n\t\t\t\tif runeCount > maxWordLength {\n\t\t\t\t\tmaxWordLength = runeCount\n\t\t\t\t}\n\t\t\t} else {\n\t\t\t\tfrequencies[id]++\n\t\t\t}\n\t\t}\n\t}\n\n\tvar totals int\n\tfor _, f := range frequencies {\n\t\ttotals += f\n\t}\n\n\treturn &Corpus{words, frequencies, knownWords, int64(len(words)), totals, maxWordLength}\n}\n\n// ViterbiSplit is a Viterbi algorithm for splitting words given a corpus\nfunc ViterbiSplit(input string, c *Corpus) []string {\n\ts := strings.ToLower(input)\n\tprobabilities := []float64{1.0}\n\tlasts := []int{0}\n\n\trunes := []int{}\n\tfor i := range s {\n\t\trunes = append(runes, i)\n\t}\n\trunes = append(runes, len(s)+1)\n\n\tfor i := range s {\n\t\tprobs := make([]float64, 0)\n\t\tls := make([]int, 0)\n\n\t\t// m := maxInt(0, i-c.maxWordLength)\n\n\t\tfor j, r := range runes {\n\t\t\tif r > i {\n\t\t\t\tbreak\n\t\t\t}\n\n\t\t\tp, ok 
:= c.WordProb(s[r : i+1])\n\t\t\tif !ok {\n\t\t\t\t// http://stackoverflow.com/questions/195010/how-can-i-split-multiple-joined-words#comment48879458_481773\n\t\t\t\tp = (math.Log(float64(1)/float64(c.totalFreq)) - float64(c.maxWordLength) - float64(1)) * float64(i-r) // note it should be i-r not j-i as per the SO post\n\t\t\t}\n\t\t\tprob := probabilities[j] * p\n\n\t\t\tprobs = append(probs, prob)\n\t\t\tls = append(ls, r)\n\t\t}\n\n\t\tmaxProb := -math.SmallestNonzeroFloat64\n\t\tmaxK := -1 << 63\n\t\tfor j, p := range probs {\n\t\t\tif p > maxProb {\n\t\t\t\tmaxProb = p\n\t\t\t\tmaxK = ls[j]\n\t\t\t}\n\t\t}\n\t\tprobabilities = append(probabilities, maxProb)\n\t\tlasts = append(lasts, maxK)\n\t}\n\n\twords := make([]string, 0)\n\ti := utf8.RuneCountInString(s)\n\n\tfor i > 0 {\n\t\tstart := lasts[i]\n\t\twords = append(words, s[start:i])\n\t\ti = start\n\t}\n\n\t// reverse it\n\tfor i, j := 0, len(words)-1; i < j; i, j = i+1, j-1 {\n\t\twords[i], words[j] = words[j], words[i]\n\t}\n\n\treturn words\n}\n\n// CosineSimilarity measures the cosine similarity of two strings.\nfunc CosineSimilarity(a, b []string) float64 {\n\tcountsA := make([]float64, 0)\n\tcountsB := make([]float64, 0)\n\tuniques := make(map[string]int)\n\n\t// index the strings first\n\tfor _, st := range a {\n\t\ts := strings.ToLower(st)\n\t\tid, ok := uniques[s]\n\t\tif !ok {\n\t\t\tuniques[s] = len(countsA)\n\t\t\tcountsA = append(countsA, 1)\n\t\t\tcountsB = append(countsB, 0) // create for countsB, but don't add\n\t\t} else {\n\t\t\tcountsA[id]++\n\t\t}\n\t}\n\n\tfor _, st := range b {\n\t\ts := strings.ToLower(st)\n\t\tid, ok := uniques[s]\n\t\tif !ok {\n\t\t\tuniques[s] = len(countsA)\n\t\t\tcountsA = append(countsA, 0)\n\t\t\tcountsB = append(countsB, 1)\n\t\t} else {\n\t\t\tcountsB[id]++\n\t\t}\n\t}\n\n\tmagA, err := mag(countsA)\n\tif err != nil {\n\t\tpanic(err)\n\t}\n\n\tmagB, err := mag(countsB)\n\tif err != nil {\n\t\tpanic(err)\n\t}\n\n\tdotProd, err := dot(countsA, countsB)\n\tif 
err != nil {\n\t\tpanic(err)\n\t}\n\n\treturn dotProd / (magA * magB)\n\n}\n\n// DamerauLevenshtein calculates the Damerau-Levenshtein distance between two strings. See more at https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance\nfunc DamerauLevenshtein(s1 string, s2 string) (distance int) {\n\t// index by code point, not byte\n\tr1 := []rune(s1)\n\tr2 := []rune(s2)\n\n\t// the maximum possible distance\n\tinf := len(r1) + len(r2)\n\n\t// if one string is blank, we need insertions\n\t// for all characters in the other one\n\tif len(r1) == 0 {\n\t\treturn len(r2)\n\t}\n\n\tif len(r2) == 0 {\n\t\treturn len(r1)\n\t}\n\n\t// construct the edit-tracking matrix\n\tmatrix := make([][]int, len(r1))\n\tfor i := range matrix {\n\t\tmatrix[i] = make([]int, len(r2))\n\t}\n\n\t// seen characters\n\tseenRunes := make(map[rune]int)\n\n\tif r1[0] != r2[0] {\n\t\tmatrix[0][0] = 1\n\t}\n\n\tseenRunes[r1[0]] = 0\n\tfor i := 1; i < len(r1); i++ {\n\t\tdeleteDist := matrix[i-1][0] + 1\n\t\tinsertDist := (i+1)*1 + 1\n\t\tvar matchDist int\n\t\tif r1[i] == r2[0] {\n\t\t\tmatchDist = i\n\t\t} else {\n\t\t\tmatchDist = i + 1\n\t\t}\n\t\tmatrix[i][0] = minInt(minInt(deleteDist, insertDist), matchDist)\n\t}\n\n\tfor j := 1; j < len(r2); j++ {\n\t\tdeleteDist := (j + 1) * 2\n\t\tinsertDist := matrix[0][j-1] + 1\n\t\tvar matchDist int\n\t\tif r1[0] == r2[j] {\n\t\t\tmatchDist = j\n\t\t} else {\n\t\t\tmatchDist = j + 1\n\t\t}\n\n\t\tmatrix[0][j] = minInt(minInt(deleteDist, insertDist), matchDist)\n\t}\n\n\tfor i := 1; i < len(r1); i++ {\n\t\tvar maxSrcMatchIndex int\n\t\tif r1[i] == r2[0] {\n\t\t\tmaxSrcMatchIndex = 0\n\t\t} else {\n\t\t\tmaxSrcMatchIndex = -1\n\t\t}\n\n\t\tfor j := 1; j < len(r2); j++ {\n\t\t\tswapIndex, ok := seenRunes[r2[j]]\n\t\t\tjSwap := maxSrcMatchIndex\n\t\t\tdeleteDist := matrix[i-1][j] + 1\n\t\t\tinsertDist := matrix[i][j-1] + 1\n\t\t\tmatchDist := matrix[i-1][j-1]\n\t\t\tif r1[i] != r2[j] {\n\t\t\t\tmatchDist += 1\n\t\t\t} else 
{\n\t\t\t\tmaxSrcMatchIndex = j\n\t\t\t}\n\n\t\t\t// for transpositions\n\t\t\tvar swapDist int\n\t\t\tif ok && jSwap != -1 {\n\t\t\t\tiSwap := swapIndex\n\t\t\t\tvar preSwapCost int\n\t\t\t\tif iSwap == 0 && jSwap == 0 {\n\t\t\t\t\tpreSwapCost = 0\n\t\t\t\t} else {\n\t\t\t\t\tpreSwapCost = matrix[maxInt(0, iSwap-1)][maxInt(0, jSwap-1)]\n\t\t\t\t}\n\t\t\t\tswapDist = i + j + preSwapCost - iSwap - jSwap - 1\n\t\t\t} else {\n\t\t\t\tswapDist = inf\n\t\t\t}\n\t\t\tmatrix[i][j] = minInt(minInt(minInt(deleteDist, insertDist), matchDist), swapDist)\n\t\t}\n\t\tseenRunes[r1[i]] = i\n\t}\n\n\treturn matrix[len(r1)-1][len(r2)-1]\n}\n\n// LongestCommonPrefix takes a slice of strings, and finds the longest common prefix\nfunc LongestCommonPrefix(strs ...string) string {\n\tswitch len(strs) {\n\tcase 0:\n\t\treturn \"\" // idiots\n\tcase 1:\n\t\treturn strs[0]\n\t}\n\n\tmin := strs[0]\n\tmax := strs[0]\n\n\tfor _, s := range strs[1:] {\n\t\tswitch {\n\t\tcase s < min:\n\t\t\tmin = s\n\t\tcase s > max:\n\t\t\tmax = s\n\t\t}\n\t}\n\n\tfor i := 0; i < len(min) && i < len(max); i++ {\n\t\tif min[i] != max[i] {\n\t\t\treturn min[:i]\n\t\t}\n\t}\n\n\t// In the case where lengths are not equal but all bytes\n\t// are equal, min is the answer (\"foo\" < \"foobar\").\n\treturn min\n}\n\n/* The following two functions help in parsing a string into numbers. 
It's recommended you write abstractions over the functions*/\n\n// StrsToInts converts a string slice into an int slice, with the help of NumberWords.\n// The function assumes all helper words like \"and\" have been stripped.\n// \t\t\"One hundred and five\" -> []string{\"one\", \"hundred\", \"five\"}\n// This is a very primitive method, and doesn't take into account other words like \"a hundred\" or \"a couple of hundred\"\nfunc StrsToInts(strs []string) (retVal []int, err error) {\n\tfor _, s := range strs {\n\t\tintVal, ok := lingo.NumberWords[s]\n\t\tif !ok {\n\t\t\treturn nil, errors.Errorf(\"Unable to parse the words %q as numbers\", s)\n\t\t}\n\n\t\tif len(retVal) > 0 && intVal == 100 && retVal[len(retVal)-1] < 100 {\n\t\t\tretVal[len(retVal)-1] *= 100\n\t\t} else if len(retVal) > 0 && retVal[len(retVal)-1] < 1000 && intVal < 1000 {\n\t\t\tretVal[len(retVal)-1] += intVal\n\t\t} else {\n\t\t\tretVal = append(retVal, intVal)\n\t\t}\n\t}\n\treturn\n}\n\n// CombineInts takes a int slice, and tries to make it one integer.\n// It works by taking advantage of english - anything more than 1000 has a repeated pattern\n// e.g.\n// \t\tone hundred and fifty thousand two hundred and two\n// there are 2 repeated patterns (one hundred and fifty) and  (two hundred and two)\n//\n// This allows us to repeatedly combine by addition or multiplication until there is one left\nfunc CombineInts(ints []int) int {\n\tvar total int\n\tfor len(ints) > 0 {\n\t\tif len(ints) == 1 || ints[0] >= 1000 {\n\t\t\tlast := ints[len(ints)-1]\n\t\t\ttotal += last\n\t\t\tints = ints[0 : len(ints)-1] //pop it\n\t\t} else {\n\t\t\tif ints[1] < 1000 {\n\t\t\t\t// something went wrong\n\t\t\t\tpanic(\"HELP!\")\n\t\t\t}\n\t\t\ttotal += ints[0] * ints[1]\n\t\t\tints = ints[2:]\n\t\t}\n\t}\n\treturn total\n}\n"
  },
  {
    "path": "corpus/functions_test.go",
    "content": "package corpus\n\nimport (\n\t\"strings\"\n\t\"testing\"\n\n\t\"github.com/stretchr/testify/assert\"\n)\n\nfunc Test_GenerateCorpus(t *testing.T) {\n\tsentenceTags := mediumSentence()\n\tdict := GenerateCorpus(sentenceTags)\n\n\t// testing time\n\tassert := assert.New(t)\n\texpectedWords := []string{\"\", \"-UNKNOWN-\", \"-ROOT-\", \"President\", \"Bush\", \"on\", \"Tuesday\", \"nominated\", \"two\", \"individuals\", \"to\", \"replace\", \"retiring\", \"jurists\", \"federal\", \"courts\", \"in\", \"the\", \"Washington\", \"area\", \".\"}\n\n\texpectedIDs := make(map[string]int)\n\tfor i, w := range expectedWords {\n\t\texpectedIDs[w] = i\n\t}\n\n\tassert.Equal(expectedWords, dict.words, \"Corpus known words should be the same as the manually annotated expected values\")\n\tassert.Equal(expectedIDs, dict.ids, \"IDs should be the same as expected IDs\")\n\tassert.Equal(int64(len(expectedWords)), dict.maxid)\n}\n\nfunc TestViterbiSplit(t *testing.T) {\n\tassert := assert.New(t)\n\tdict := GenerateCorpus(mediumSentence())\n\n\ts2 := \"twoindividuals\"\n\twords := ViterbiSplit(s2, dict)\n\tassert.Equal([]string{\"two\", \"individuals\"}, words)\n\n\ts2 = \"FederalCourts\"\n\twords = ViterbiSplit(s2, dict)\n\tassert.Equal([]string{\"federal\", \"courts\"}, words)\n\n\ts3 := \"toreplaceon\"\n\twords = ViterbiSplit(s3, dict)\n\tassert.Equal([]string{\"to\", \"replace\", \"on\"}, words)\n}\n\nfunc TestCosineSimilarity(t *testing.T) {\n\ta := strings.Split(\"This is a test of cosine similarity\", \" \")\n\tb := strings.Split(\"This is not a test of cosine similarity\", \" \")\n\n\ts1 := CosineSimilarity(a, a)\n\ts2 := CosineSimilarity(a, b)\n\n\tif !floatEquals64(s1, 1) {\n\t\tt.Error(\"Expected similarity to be 1 when compared with itself\")\n\t}\n\tif s2 > s1 {\n\t\tt.Error(\"Something went wrong with the cosine similarity algorithm\")\n\t}\n\n\tc := strings.Split(\"Parramatta Road\", \" \")\n\td := strings.Split(\"Parramatta Rd\", \" \")\n\n\ts1 = 
CosineSimilarity(c, c)\n\ts2 = CosineSimilarity(c, d)\n\n\tif !floatEquals64(s1, 1) {\n\t\tt.Error(\"Expected similarity to be 1 when compared with itself\")\n\t}\n\tif s2 > s1 {\n\t\tt.Error(\"Something went wrong with the cosine similarity algorithm\")\n\t}\n}\n\nfunc TestDL(t *testing.T) {\n\ta := \"This is a test of Damerau Levenshtein\"\n\tb := \"This is not a test of Damerau Levenshtein\"\n\n\ts1 := DamerauLevenshtein(a, a)\n\ts2 := DamerauLevenshtein(a, b)\n\tif s1 != 0 {\n\t\tt.Errorf(\"Expected the distance to be 0 when compared against itself. Got %d\", s1)\n\t}\n\n\tif s2 < s1 {\n\t\tt.Error(\"Expected DL similarity to be greater when compared against itself\")\n\t}\n\n\tc := \"Parramatta Road\"\n\td := \"Paramatta Rd\"\n\n\ts1 = DamerauLevenshtein(c, c)\n\ts2 = DamerauLevenshtein(c, d)\n\n\tif s1 != 0 {\n\t\tt.Errorf(\"Expected the distance to be 0 when compared against itself. Got %d\", s1)\n\t}\n\tif s2 < s1 {\n\t\tt.Error(\"Expected DL similarity to be greater when compared against itself\")\n\t}\n}\n\nfunc TestLCP(t *testing.T) {\n\tassert := assert.New(t)\n\tlcp := LongestCommonPrefix(\"Hello World\", \"Hell yeah!\")\n\tassert.Equal(\"Hell\", lcp)\n\n\tlcp = LongestCommonPrefix(\"Hello World\", \"Hell yeah!\", \"hey there\")\n\tassert.Equal(\"\", lcp)\n\n\tlcp = LongestCommonPrefix()\n\tassert.Equal(\"\", lcp)\n\n\tlcp = LongestCommonPrefix(\"OneWord\")\n\tassert.Equal(\"OneWord\", lcp)\n\n\tlcp = LongestCommonPrefix(\"foo\", \"foobar\")\n\tassert.Equal(\"foo\", lcp)\n}\n\nvar parseNumTests = []struct {\n\ts string\n\tv int\n}{\n\t{\"twenty nine\", 29},\n\t{\"one hundred five\", 105},\n\t{\"five hundred twenty thousand twenty one\", 520021},\n}\n\nfunc TestParseNumber(t *testing.T) {\n\tfor _, pnts := range parseNumTests {\n\t\ts := strings.Split(pnts.s, \" \")\n\t\tints, err := StrsToInts(s)\n\t\tif err != nil {\n\t\t\tt.Error(err)\n\t\t\tcontinue\n\t\t}\n\n\t\tv := CombineInts(ints)\n\t\tif v != pnts.v {\n\t\t\tt.Errorf(\"Expected %q to be parsed 
to %d. Got %d instead\", pnts.s, pnts.v, v)\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "corpus/inflection.go",
    "content": "package corpus\n\nimport (\n\t\"regexp\"\n\n\t\"github.com/chewxy/lingo\"\n)\n\ntype conversionPattern struct {\n\tpattern     *regexp.Regexp\n\treplacement string\n}\n\nfunc newConversionPattern(from, to string) conversionPattern {\n\trFrom := regexp.MustCompile(from)\n\treturn conversionPattern{rFrom, to}\n}\n\n// singular -> plural (rules used to pluralize a word)\nvar plural = []conversionPattern{\n\tnewConversionPattern(\"(quiz)$\", \"${1}zes\"),\n\tnewConversionPattern(\"^(ox)$\", \"${1}en\"),\n\tnewConversionPattern(\"([m|l])ouse$\", \"${1}ice\"),\n\tnewConversionPattern(\"(matr|vert|ind)ix|ex$\", \"${1}ices\"),\n\tnewConversionPattern(\"(x|ch|ss|sh)$\", \"${1}es\"),\n\tnewConversionPattern(\"([^aeiouy]|qu)ies$\", \"${1}y\"),\n\tnewConversionPattern(\"([^aeiouy]|qu)y$\", \"${1}ies\"),\n\tnewConversionPattern(\"(hive)$\", \"${1}s\"),\n\tnewConversionPattern(\"(?:([^f])fe|([lr])f)$\", \"${1}${2}ves\"),\n\tnewConversionPattern(\"sis$\", \"ses\"),\n\tnewConversionPattern(\"([ti])um$\", \"${1}a\"),\n\tnewConversionPattern(\"(buffal|tomat|potat)o$\", \"${1}oes\"),\n\tnewConversionPattern(\"(bu)s$\", \"${1}ses\"),\n\tnewConversionPattern(\"(alias|status|sex)$\", \"${1}es\"),\n\tnewConversionPattern(\"(octop|vir)us$\", \"${1}i\"),\n\tnewConversionPattern(\"(ax|test)is$\", \"${1}es\"),\n\tnewConversionPattern(\"s$\", \"s\"),\n\tnewConversionPattern(\"$\", \"s\"),\n}\n\n// plural -> singular (rules used to singularize a word)\nvar singular = []conversionPattern{\n\tnewConversionPattern(\"(quiz)zes$\", \"${1}\"),\n\tnewConversionPattern(\"(matr)ices$\", \"${1}ix\"),\n\tnewConversionPattern(\"(vert|ind)ices$\", \"${1}ex\"),\n\tnewConversionPattern(\"^(ox)en\", \"${1}\"),\n\tnewConversionPattern(\"(alias|status)es$\", \"${1}\"),\n\tnewConversionPattern(\"(octop|vir)i$\", \"${1}us\"),\n\tnewConversionPattern(\"(cris|ax|test)es$\", \"${1}is\"),\n\tnewConversionPattern(\"(shoe)s$\", \"${1}\"),\n\tnewConversionPattern(\"(o)es$\", \"${1}\"),\n\tnewConversionPattern(\"(bus)es$\", 
\"${1}\"),\n\tnewConversionPattern(\"([m|l])ice$\", \"${1}ouse\"),\n\tnewConversionPattern(\"(x|ch|ss|sh)es$\", \"${1}\"),\n\tnewConversionPattern(\"(m)ovies$\", \"${1}ovie\"),\n\tnewConversionPattern(\"(s)eries$\", \"${1}eries\"),\n\tnewConversionPattern(\"([^aeiouy]|qu)ies$\", \"${1}y\"),\n\tnewConversionPattern(\"([lr])ves$\", \"${1}f\"),\n\tnewConversionPattern(\"(tive)s$\", \"${1}\"),\n\tnewConversionPattern(\"(hive)s$\", \"${1}\"),\n\tnewConversionPattern(\"([^f])ves$\", \"${1}fe\"),\n\tnewConversionPattern(\"(^analy)ses$\", \"${1}sis\"),\n\tnewConversionPattern(\"((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$\", \"${1}${2}sis\"),\n\tnewConversionPattern(\"([ti])a$\", \"${1}um\"),\n\tnewConversionPattern(\"(n)ews$\", \"${1}ews\"),\n\tnewConversionPattern(\"s$\", \"\"),\n}\n\n// weird pluralizations that don't match the rules above\nvar irregular = []conversionPattern{\n\tnewConversionPattern(\"person\", \"people\"),\n\tnewConversionPattern(\"man\", \"men\"),\n\tnewConversionPattern(\"child\", \"children\"),\n\tnewConversionPattern(\"sex\", \"sexes\"),\n\tnewConversionPattern(\"move\", \"moves\"),\n\tnewConversionPattern(\"sleeve\", \"sleeves\"),\n\tnewConversionPattern(\"datum\", \"data\"),\n\tnewConversionPattern(\"box\", \"boxes\"),\n\tnewConversionPattern(\"knife\", \"knives\"),\n}\n\nvar unconvertable = []string{\n\t\"equipment\",\n\t\"information\",\n\t\"rice\",\n\t\"money\",\n\t\"species\",\n\t\"series\",\n\t\"fish\",\n\t\"sheep\",\n}\n\n// Pluralize pluralizes words based on rules known\nfunc Pluralize(word string) string {\n\tif lingo.InStringSlice(word, unconvertable) {\n\t\treturn word\n\t}\n\n\tfor _, cp := range irregular {\n\t\tif cp.pattern.MatchString(word) {\n\t\t\treturn cp.replacement\n\t\t}\n\t}\n\n\tfor _, cp := range plural {\n\t\tif cp.pattern.MatchString(word) {\n\t\t\t// log.Printf(\"\\t%q Matches %q\", word, cp.pattern.String())\n\t\t\treturn cp.pattern.ReplaceAllString(word, cp.replacement)\n\t\t}\n\t}\n\treturn 
word\n}\n\n// Singularize singularizes words based on rules known\nfunc Singularize(word string) string {\n\tif lingo.InStringSlice(word, unconvertable) {\n\t\treturn word\n\t}\n\n\tfor _, cp := range singular {\n\t\tif cp.pattern.MatchString(word) {\n\t\t\treturn cp.pattern.ReplaceAllString(word, cp.replacement)\n\t\t}\n\t}\n\treturn word\n}\n"
  },
  {
    "path": "corpus/inflection_test.go",
    "content": "package corpus\n\nimport \"testing\"\n\nvar pluralizeTest = []struct {\n\tword, correct string\n}{\n\t{\"friend\", \"friends\"},\n\t{\"tomato\", \"tomatoes\"},\n\t{\"knife\", \"knives\"},\n\t{\"dwarf\", \"dwarves\"},\n\t{\"box\", \"boxes\"},\n\t{\"ox\", \"oxen\"},\n\t{\"man\", \"men\"},\n\t{\"equipment\", \"equipment\"},\n}\n\nvar singularizeTest = []struct {\n\tword, correct string\n}{\n\t{\"condolences\", \"condolence\"},\n\t{\"fish\", \"fish\"},\n\t{\"shoes\", \"shoe\"},\n\t{\"viri\", \"virus\"},\n\t{\"elves\", \"elf\"},\n}\n\nfunc TestPluralize(t *testing.T) {\n\tfor _, pts := range pluralizeTest {\n\t\tgot := Pluralize(pts.word)\n\t\tif got != pts.correct {\n\t\t\tt.Errorf(\"Pluralizing %q failed. Want %q. Got %q instead\", pts.word, pts.correct, got)\n\t\t}\n\t}\n}\n\nfunc TestSingularize(t *testing.T) {\n\tfor _, pts := range singularizeTest {\n\t\tgot := Singularize(pts.word)\n\t\tif got != pts.correct {\n\t\t\tt.Errorf(\"Singularizing %q failed. Want %q. Got %q instead\", pts.word, pts.correct, got)\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "corpus/io.go",
    "content": "package corpus\n\nimport (\n\t\"bufio\"\n\t\"bytes\"\n\t\"encoding/gob\"\n\t\"io\"\n\t\"strconv\"\n\t\"strings\"\n)\n\n// sortutil is a utility struct meant to sort words based on IDs\ntype sortutil struct {\n\twords []string\n\tids   []int\n\tfreqs []int\n}\n\nfunc (s *sortutil) Len() int           { return len(s.words) }\nfunc (s *sortutil) Less(i, j int) bool { return s.ids[i] < s.ids[j] }\nfunc (s *sortutil) Swap(i, j int) {\n\ts.words[i], s.words[j] = s.words[j], s.words[i]\n\ts.ids[i], s.ids[j] = s.ids[j], s.ids[i]\n\tif len(s.freqs) > 0 {\n\t\ts.freqs[i], s.freqs[j] = s.freqs[j], s.freqs[i]\n\t}\n}\n\n// ToDictWithFreq returns a simple marshalable type. Conceptually it's a JSON object with the words as the keys. The values are a pair - ID and Freq.\nfunc ToDictWithFreq(c *Corpus) map[string]struct{ ID, Freq int } {\n\tretVal := make(map[string]struct{ ID, Freq int })\n\tfor i, w := range c.words {\n\t\tretVal[w] = struct{ ID, Freq int }{i, c.frequencies[i]}\n\t}\n\treturn retVal\n}\n\n// ToDict returns a marshalable dict. 
It returns a copy of the ID mapping.\nfunc ToDict(c *Corpus) map[string]int {\n\tretVal := make(map[string]int)\n\tfor k, v := range c.ids {\n\t\tretVal[k] = v\n\t}\n\treturn retVal\n}\n\n// GobEncode implements GobEncoder for *Corpus\nfunc (c *Corpus) GobEncode() ([]byte, error) {\n\tvar buf bytes.Buffer\n\tencoder := gob.NewEncoder(&buf)\n\n\tif err := encoder.Encode(c.words); err != nil {\n\t\treturn nil, err\n\t}\n\n\tif err := encoder.Encode(c.ids); err != nil {\n\t\treturn nil, err\n\t}\n\n\tif err := encoder.Encode(c.frequencies); err != nil {\n\t\treturn nil, err\n\t}\n\n\tif err := encoder.Encode(c.maxid); err != nil {\n\t\treturn nil, err\n\t}\n\n\tif err := encoder.Encode(c.totalFreq); err != nil {\n\t\treturn nil, err\n\t}\n\n\tif err := encoder.Encode(c.maxWordLength); err != nil {\n\t\treturn nil, err\n\t}\n\n\treturn buf.Bytes(), nil\n}\n\n// GobDecode implements GobDecoder for *Corpus\nfunc (c *Corpus) GobDecode(buf []byte) error {\n\tb := bytes.NewBuffer(buf)\n\tdecoder := gob.NewDecoder(b)\n\n\tif err := decoder.Decode(&c.words); err != nil {\n\t\treturn err\n\t}\n\n\tif err := decoder.Decode(&c.ids); err != nil {\n\t\treturn err\n\t}\n\n\tif err := decoder.Decode(&c.frequencies); err != nil {\n\t\treturn err\n\t}\n\n\tif err := decoder.Decode(&c.maxid); err != nil {\n\t\treturn err\n\t}\n\n\tif err := decoder.Decode(&c.totalFreq); err != nil {\n\t\treturn err\n\t}\n\n\tif err := decoder.Decode(&c.maxWordLength); err != nil {\n\t\treturn err\n\t}\n\n\treturn nil\n}\n\n// LoadOneGram loads a 1_gram.txt file, which is a tab separated file which lists the frequency counts of words. 
Example:\n// \t\tthe\t23135851162\n// \t\tof\t13151942776\n// \t\tand\t12997637966\n// \t\tto\t12136980858\n// \t\ta\t9081174698\n// \t\tin\t8469404971\n// \t\tfor\t5933321709\nfunc (c *Corpus) LoadOneGram(r io.Reader) error {\n\tscanner := bufio.NewScanner(r)\n\tfor scanner.Scan() {\n\t\tline := scanner.Text()\n\t\tsplits := strings.Split(line, \"\\t\")\n\n\t\tif len(splits) == 0 {\n\t\t\tbreak\n\t\t}\n\n\t\tword := splits[0] // TODO: normalize\n\t\tcount, err := strconv.Atoi(splits[1])\n\t\tif err != nil {\n\t\t\treturn err\n\t\t}\n\n\t\tid := c.Add(word)\n\t\tc.frequencies[id] = count\n\t\tc.totalFreq--\n\t\tc.totalFreq += count\n\n\t\twc := len([]rune(word))\n\t\tif wc > c.maxWordLength {\n\t\t\tc.maxWordLength = wc\n\t\t}\n\t}\n\treturn nil\n}\n"
  },
  {
    "path": "corpus/io_test.go",
    "content": "package corpus\n\nimport (\n\t\"bytes\"\n\t\"encoding/gob\"\n\t\"strings\"\n\t\"testing\"\n\n\t\"github.com/stretchr/testify/assert\"\n)\n\nfunc TestCorpusGob(t *testing.T) {\n\tbuf := new(bytes.Buffer)\n\n\tc := New()\n\tc.Add(\"Hello\")\n\tc.Add(\"World\")\n\n\thelloID, _ := c.Id(\"Hello\")\n\tworldID, _ := c.Id(\"World\")\n\n\tencoder := gob.NewEncoder(buf)\n\tdecoder := gob.NewDecoder(buf)\n\n\tif err := encoder.Encode(c); err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tc2 := New()\n\tif err := decoder.Decode(c2); err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tif hid, ok := c2.Id(\"Hello\"); !ok || (ok && hid != helloID) {\n\t\tt.Errorf(\"\\\"Hello\\\" not found after decoding.\")\n\t}\n\n\tif wid, ok := c2.Id(\"World\"); !ok || (ok && wid != worldID) {\n\t\tt.Errorf(\"\\\"World\\\" not found after decoding.\")\n\t}\n}\n\nfunc TestCorpusToDict(t *testing.T) {\n\tassert := assert.New(t)\n\tc, _ := Construct(WithWords([]string{\"World\", \"Hello\", \"World\"}))\n\n\td := ToDict(c)\n\tc2, err := Construct(FromDict(d))\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\tassert.Equal(c.words, c2.words, \"Expected words to be the same\")\n\tassert.Equal(c.ids, c2.ids, \"Expected IDs to be the same\")\n\tassert.NotEqual(c.frequencies, c2.frequencies, \"Expected frequencies to not be the same\")\n\tassert.Equal(c.maxid, c2.maxid, \"Expected maxID to be the same\")\n\tassert.NotEqual(c.totalFreq, c2.totalFreq, \"Expected totalFreq to be different.\")\n\tassert.Equal(c.maxWordLength, c2.maxWordLength, \"Expected maxWordLength to be the same\")\n}\n\nfunc TestCorpusToDictWithFreq(t *testing.T) {\n\tassert := assert.New(t)\n\tc, _ := Construct(WithWords([]string{\"World\", \"Hello\", \"World\"}))\n\n\td := ToDictWithFreq(c)\n\tc2, err := Construct(FromDictWithFreq(d))\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tassert.Equal(c, c2)\n}\n\nfunc TestLoadOneGram(t *testing.T) {\n\tassert := assert.New(t)\n\tr := strings.NewReader(sample1Gram)\n\n\tc := New()\n\terr := 
c.LoadOneGram(r)\n\tassert.Nil(err)\n\tassert.Equal(10, c.Size())\n\n\tid, ok := c.Id(\"for\")\n\tif !ok {\n\t\tt.Errorf(\"Expected \\\"for\\\" to be in corpus after loading one gram file\")\n\t}\n\tassert.Equal(int(c.maxid-1), id)\n\n}\n"
  },
  {
    "path": "corpus/lda.go",
    "content": "package corpus\n\nimport (\n\t\"gorgonia.org/tensor\"\n)\n\n// LDAModel ... TODO\n//https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation\ntype LDAModel struct {\n\t// params\n\tAlpha tensor.Tensor // is a Row\n\tEta   tensor.Tensor // is a Col\n\t//Kappa gorgonia.Scalar // Decay\n\t//Tau0  gorgonia.Scalar // offset\n\n\t// parameters needed for working\n\tTopics      int\n\tChunkSize   int\n\tTerms       int\n\tUpdateEvery int\n\tEvalEvery   int\n\n\t// consts\n\tIterations     int\n\tGammaThreshold float64\n\n\tMinimumProb float64\n\n\t// track current progress\n\tUpdates int\n\n\t// type\n\tDtype tensor.Dtype\n}\n\nfunc (l *LDAModel) init() {\n\teta := tensor.New(tensor.Of(l.Dtype), tensor.WithShape(l.Topics))\n\talpha := tensor.New(tensor.Of(l.Dtype), tensor.WithShape(l.Topics))\n\n\tswitch l.Dtype {\n\tcase tensor.Float64:\n\t\tv := 1.0 / float64(l.Topics)\n\t\teta.Memset(v)\n\t\talpha.Memset(v)\n\tcase tensor.Float32:\n\t\tv := float32(1) / float32(l.Topics)\n\t\teta.Memset(v)\n\t\talpha.Memset(v)\n\t}\n\n\tl.Alpha = alpha\n\tl.Eta = eta\n}\n"
  },
  {
    "path": "corpus/test_test.go",
    "content": "package corpus\n\nimport (\n\t\"strings\"\n\n\t\"github.com/chewxy/lingo/treebank\"\n)\n\nconst sample1Gram = `the\t23135851162\nof\t13151942776\nand\t12997637966\nto\t12136980858\na\t9081174698\nin\t8469404971\nfor\t5933321709`\n\nfunc mediumSentence() []treebank.SentenceTag {\n\tconllu := `1\tPresident\tPresident\tPROPN\tNNP\tNumber=Sing\t2\tcompound\t_\t_\n2\tBush\tBush\tPROPN\tNNP\tNumber=Sing\t5\tnsubj\t_\t_\n3\ton\ton\tADP\tIN\t_\t4\tcase\t_\t_\n4\tTuesday\tTuesday\tPROPN\tNNP\tNumber=Sing\t5\tnmod\t_\t_\n5\tnominated\tnominate\tVERB\tVBD\tMood=Ind|Tense=Past|VerbForm=Fin\t0\troot\t_\t_\n6\ttwo\ttwo\tNUM\tCD\tNumType=Card\t7\tnummod\t_\t_\n7\tindividuals\tindividual\tNOUN\tNNS\tNumber=Plur\t5\tdobj\t_\t_\n8\tto\tto\tPART\tTO\t_\t9\tmark\t_\t_\n9\treplace\treplace\tVERB\tVB\tVerbForm=Inf\t5\tadvcl\t_\t_\n10\tretiring\tretire\tVERB\tVBG\tVerbForm=Ger\t11\tamod\t_\t_\n11\tjurists\tjurist\tNOUN\tNNS\tNumber=Plur\t9\tdobj\t_\t_\n12\ton\ton\tADP\tIN\t_\t14\tcase\t_\t_\n13\tfederal\tfederal\tADJ\tJJ\tDegree=Pos\t14\tamod\t_\t_\n14\tcourts\tcourt\tNOUN\tNNS\tNumber=Plur\t11\tnmod\t_\t_\n15\tin\tin\tADP\tIN\t_\t18\tcase\t_\t_\n16\tthe\tthe\tDET\tDT\tDefinite=Def|PronType=Art\t18\tdet\t_\t_\n17\tWashington\tWashington\tPROPN\tNNP\tNumber=Sing\t18\tcompound\t_\t_\n18\tarea\tarea\tNOUN\tNN\tNumber=Sing\t14\tnmod\t_\t_\n19\t.\t.\tPUNCT\t.\t_\t5\tpunct\t_\t_\n\n`\n\n\treadr := strings.NewReader(conllu)\n\treturn treebank.ReadConllu(readr)\n}\n\nconst EPSILON64 float64 = 1e-10\n\nfunc floatEquals64(a, b float64) bool {\n\tif (a-b) < EPSILON64 && (b-a) < EPSILON64 {\n\t\treturn true\n\t}\n\treturn false\n}\n"
  },
  {
    "path": "corpus/utils.go",
    "content": "package corpus\n\nimport (\n\t\"errors\"\n\t\"math\"\n)\n\nfunc minInt(a, b int) int {\n\tif a < b {\n\t\treturn a\n\t}\n\treturn b\n}\n\nfunc maxInt(a, b int) int {\n\tif a > b {\n\t\treturn a\n\t}\n\treturn b\n}\n\nfunc dot(a, b []float64) (float64, error) {\n\tif len(a) != len(b) {\n\t\treturn 0, errors.New(\"Differing lengths!\")\n\t}\n\n\tvar retVal float64\n\tfor i, v := range a {\n\t\tretVal += v * b[i]\n\t}\n\treturn retVal, nil\n}\n\nfunc mag(a []float64) (float64, error) {\n\tdotProd, err := dot(a, a)\n\tif err != nil {\n\t\treturn dotProd, err\n\t}\n\treturn math.Sqrt(dotProd), nil\n}\n"
  },
  {
    "path": "dep/README.md",
    "content": "# Dependency Parser #\n\nPackage `dep` provides data structures and algorithms for a dependency parser as described by [Chen and Manning 2014](http://cs.stanford.edu/people/danqi/papers/emnlp2014.pdf) [PDF]. It achieves accuracy scores similar to those of the cited paper.\n\n# Installing #\n\n`go get -u github.com/chewxy/lingo/dep`\n\n\n\n# How It Works #\n\n## Transition Based Parsing ##\n\nThe core of the parser is a transition based parser, as popularized by [Nivre 2003](https://stp.lingfil.uu.se/~nivre/docs/iwpt03.pdf) [PDF]. It's essentially a [shift-reduce parser](https://en.wikipedia.org/wiki/Shift-reduce_parser) with more states. Dan Jurafsky has a very [complete overview of transition-based parsing](https://web.stanford.edu/~jurafsky/slp3/14.pdf) [PDF], which should be consulted should more questions arise.\n\n### Transitions ###\n\nAt the core of a transition based parser are two data structures: a stack and a queue. The queue, or buffer, holds a list of words waiting to be parsed. Parsing is then simply a matter of manipulating the state of the stack and queue. Specifically, there are three possible actions in an arc-standard parser:\n\n* `Shift`: Shift simply shifts one word from the buffer onto the top of the stack.\n* `Left`: Left means the top of the stack is the head of the word underneath it. After the transition is applied (the link between the nodes attached), the word underneath the top of the stack is removed.\n* `Right`: Right means that the top of the stack is the child of the word underneath it. After the transition is applied, the top of the stack is popped.\n\nA word on the terms \"head\" and \"child\". Consider the sentence \"I am human\":\n\n![\"I am human\" example](https://github.com/chewxy/lingo/blob/master/dep/documentation/iamhuman.dot.png?raw=true)\n\nWe say \"human\" is the head of the words \"I\" and \"am\". 
Therefore, \"I\" and \"am\" are considered to be children of \"human\".\n\n### Example ###\n\nLet's look at a simple example to make the ideas concrete: \"The cat sat on the mat\". Here are the states:\n\n| Step | Stack                         | Buffer                                    | Transition |\n|------|-------------------------------|-------------------------------------------|------------|\n|0 | [ROOT]                            | [\"The\", \"cat\", \"sat\", \"on\", \"the\", \"mat\"] | Shift      |\n|1 | [ROOT, \"The\"]                     | [\"cat\", \"sat\", \"on\", \"the\", \"mat\"]        | Shift      |\n|2 | [ROOT, \"The\", \"cat\"]              | [\"sat\", \"on\", \"the\", \"mat\"]               | Left       | \n|3 | [ROOT, \"cat\"]                     | [\"sat\", \"on\", \"the\", \"mat\"]               | Shift      |\n|4 | [ROOT, \"cat\", \"sat\"]              | [\"on\", \"the\", \"mat\"]                      | Left       |\n|5 | [ROOT, \"sat\"]                     | [\"on\", \"the\", \"mat\"]                      | Shift      |\n|6 | [ROOT, \"sat\", \"on\"]               | [\"the\", \"mat\"]                            | Shift      |\n|7 | [ROOT, \"sat\", \"on\", \"the\"]        | [\"mat\"]                                   | Shift      |\n|8 | [ROOT, \"sat\", \"on\", \"the\", \"mat\"] | []                                        | Left       |\n|9 | [ROOT, \"sat\", \"on\", \"mat\"]        | []                                        | Left       |\n|10| [ROOT, \"sat\", \"mat\"]              | []                                        | Right      |\n|11| [ROOT, \"sat\"]                     | []                                        | Right      |\n\nThe above transitions produce this parse tree:\n\n![\"the cat sat on the mat\"](https://github.com/chewxy/lingo/blob/master/dep/documentation/thecatsatonthemat.dot.png?raw=true)\n\nThe real question then is of course - how does the system know which is the correct transition to emit, given the state?\n\nThe 
answer is machine learning.\n\n## Machine Learning ##\n\nWhat exactly are we learning? Or more carefully put, what are the inputs and outputs of the machine learning algorithm? The table in the example above provides a template for the inputs and output. The output is easy - the transition is what we want to learn. \n\nAs for the input, it's a little bit more complex. The input consists of the stack and the buffer. It'd be impractical and slow to include everything in the stack and buffer (dynamic neural networks are somewhat slower than static ones). So Chen and Manning came up with an ingenious idea - \n\n* Use the top 3 words of the stack\n* Use the top 3 words of the buffer\n* Use the first and second leftmost/rightmost children of the first two words of the stack\n\nInstead of directly using the words, POS Tag and dependency relations as features, the rather ingenious idea was that it would use vectors drawn from an embedding matrix to represent these features instead. So instead of building sparse features, concatenating the vectors form a fixed sized input vector. This makes training the network much more expedient. \nYou'll find this in [features.go](https://github.com/chewxy/lingo/blob/master/dependencyParser/features.go)\n\nGiven each state above, it'd be fairly trivial to extract an input vector based on the 18 \"features\" listed and feed forwards to a neural network. The result is a fast parser.\n\n### Neural Network ###\n\nThe machine learning algorithm behind this parser is a simple 3-layered network. An input layer is constructed from the embedding matrices, and is forwarded to the first layer, which is activated by a cube activation function. This then passes forwards to a dropout layer before the last layer, which is a softmax layer.\n\n[image of NN] \n\n## Hairy Bits ##\n\nThe hairy bits of this is the oracle. Specifically, the question: given a training sentence, how do we generate correct examples such as the table above? 
\n\nTODO: finish writing this section\n\n\n# How To Use #\n\nThis package provides three main data structures for use:\n\n* `Parser`\n* `Model`\n* `Trainer`\n\n`Trainer` takes a `[]treebank.SentenceTag` and produces a `Model`. `Parser` requires a `Model` to run, and is basically an exported wrapper over `configuration` that handles a pipeline.\n\n## Basic NLP Pipeline ##\n\n```go\nfunc main() {\n\tinputString := `The cat sat on the mat`\n\tlx := lexer.New(\"dummy\", strings.NewReader(inputString)) // lexer - required to break a sentence up into words. \n\tpt := pos.New(pos.WithModel(posModel))                   // POS Tagger - required to tag the words with a part of speech tag.\n\tdp := dep.New(depModel)                                  // Creates a new parser\n\n\t// set up a pipeline\n\tpt.Input = lx.Output\n\tdp.Input = pt.Output\n\n\t// run all\n\tgo lx.Run()\n\tgo pt.Run()\n\tgo dp.Run()\n\n\t// wait to receive:\n\tfor {\n\t\tselect {\n\t\tcase d := <- dp.Output:\n\t\t\t// do something\n\t\tcase err:= <-dp.Error:\n\t\t\t// handle error\n\t\t}\n\t}\n\n}\n```\n\n## Training A Model ##\n\nTo train a model you'd use the `Trainer`. The trainer accepts a `[]treebank.SentenceTag`. As long as you can parse your training file into those (package `treebank` accepts CONLLU formatted files as well as the PennTreebank formatted files), you'd be fine.\n\nAn example trainer is in the cmd directory of `lingo`.\n\n# FAQ #\n\n**Why not an LSTM or RNN to encode the state of the stack and buffer?**\n\nThe answer is simplicity and speed. I have attempted variants of the parser with different neural networks - they don't work as fast as this. I am aware of Parsey-McParseface and the slightly improved accuracy compared to this model, but the speed has not been as great as I expected. 
This package emphasises parsing speed over accuracy - for most well written English sentences, this package performs well.\n\n**Why are there no models?**\n\nI'm afraid you're gonna have to train your own models. Training takes days on the Universal Dependency dataset and I haven't had the time to train on those. All my models are specific to the use of the company, and hence cannot be released.\n\n**What caveats are there?**\n\nChen and Manning described using pre-computed activations for the top 10000 or so words. I did not implement that, but it would be trivial to revisit and implement it. Feel free to send a pull request.\n\n**How can this be sped up?**\n\nUse multiple, smaller trainers, each training on a separate batch. You can hence train them concurrently (pass the costs in a channel and collect at the end). At the end, sum the gradients before applying adagrad. The trade off is that a LOT more memory will be used. It's also the reason why it wasn't included as the default. It's quite trivial to write though. Send a pull request if you have managed to reduce memory usage.\n\n\n# Contributing #\n\nsee package lingo's CONTRIBUTING.md for more information. There is currently a list of issues in Github issues. Those are good places to start.\n\n# Licence #\n\nThis package is MIT licenced."
  },
  {
    "path": "dep/arcStandard.go",
    "content": "package dep\n\nimport \"github.com/chewxy/lingo\"\n\n// var SingleRoot bool = true // make this part of a build process\n\n// canApply checks if a particular transition can be applied\nfunc (c *configuration) canApply(t transition) bool {\n\n\tvar h head\n\tif t.Move == Left || t.Move == Right {\n\t\tif t.Move == Left {\n\t\t\th = c.stackValue(0)\n\t\t} else {\n\t\t\th = c.stackValue(1)\n\t\t}\n\n\t\tif h < 0 {\n\t\t\treturn false\n\t\t}\n\t\tif h == 0 && t.DependencyType != lingo.Root {\n\t\t\treturn false\n\t\t}\n\t}\n\n\tstackSize := c.stackSize()\n\tbufferSize := c.bufferSize()\n\n\tif t.Move == Left {\n\t\treturn stackSize > 2\n\t}\n\n\tif t.Move == Right {\n\t\treturn stackSize > 2 || (stackSize == 2 && bufferSize == 0)\n\n\t\t// if not single root build\n\t\t// return stackSize >= 2\n\t}\n\n\treturn bufferSize > 0 // strange other thing...\n\n}\n\n// apply applies the transition\nfunc (c *configuration) apply(t transition) {\n\tlogf(\"Applying %v\", t)\n\tw1 := int(c.stackValue(1))\n\tw2 := int(c.stackValue(0))\n\n\tif t.Move == Left {\n\t\tc.AddArc(w2, w1, t.DependencyType)\n\t\tc.removeSecondTopStack()\n\t} else if t.Move == Right {\n\t\tc.AddArc(w1, w2, t.DependencyType)\n\t\tc.removeTopStack()\n\t} else {\n\t\tc.shift()\n\t}\n}\n\n// oracle gets the gold transition given the state\nfunc (c *configuration) oracle(goldParse *lingo.Dependency) (t transition) {\n\tw1 := int(c.stackValue(1))\n\tw2 := int(c.stackValue(0))\n\n\tif w1 > 0 && goldParse.Head(w1) == w2 {\n\t\tt.Move = Left\n\t\tt.DependencyType = goldParse.Label(w1)\n\t\treturn\n\t} else if w1 >= 0 && goldParse.Head(w2) == w1 && !c.hasOtherChildren(w2, goldParse) {\n\t\tt.Move = Right\n\t\tt.DependencyType = goldParse.Label(w2)\n\n\t\treturn\n\t}\n\treturn // default transition is Shift\n}\n"
  },
  {
    "path": "dep/arcStandard_test.go",
    "content": "package dep\n\nimport (\n\t\"testing\"\n\n\t\"github.com/chewxy/lingo\"\n\t\"github.com/stretchr/testify/assert\"\n)\n\nfunc TestCanApply(t *testing.T) {\n\tdep := simpleSentence()[0].Dependency(dummyFix{})\n\n\tbuffer := make([]head, 0)\n\tfor i := 1; i < dep.WordCount(); i++ {\n\t\tbuffer = append(buffer, head(i))\n\t}\n\n\tstack := []head{0}\n\n\tc := &configuration{\n\t\tDependency: dep,\n\t\tstack:      stack,\n\t\tbuffer:     buffer,\n\t}\n\n\tassert := assert.New(t)\n\n\tlogf(\"Start config: \\n%v\", c)\n\n\trootLeft := c.canApply(transition{Left, lingo.Root})\n\trootRight := c.canApply(transition{Right, lingo.Root})\n\tNSubjLeft := c.canApply(transition{Left, lingo.NSubj})\n\tNSubjRight := c.canApply(transition{Right, lingo.NSubj})\n\tShiftDep := c.canApply(transition{Shift, lingo.NoDepType})\n\n\tassert.Equal(false, rootLeft, \"rootLeft should be false\")\n\tassert.Equal(false, rootRight, \"rootRight should be false\")\n\tassert.Equal(false, NSubjLeft, \"NSubjLeft should be false\")\n\tassert.Equal(false, NSubjRight, \"NSubjRight should be false\")\n\tassert.Equal(true, ShiftDep, \"ShiftDep should be true\")\n\n\tlogf(\"rootRight: %v, rootLeft: %v\", rootLeft, rootRight)\n\tlogf(\"NSubjRight: %v, NSubjLeft: %v\", NSubjRight, NSubjLeft)\n\tlogf(\"ShiftDep: %v\", ShiftDep)\n\n\tc.shift()\n\tc.shift()\n\tlogf(\"%v\", c)\n\n\trootLeft = c.canApply(transition{Left, lingo.Root})\n\trootRight = c.canApply(transition{Right, lingo.Root})\n\tNSubjLeft = c.canApply(transition{Left, lingo.NSubj})\n\tNSubjRight = c.canApply(transition{Right, lingo.NSubj})\n\tShiftDep = c.canApply(transition{Shift, lingo.NoDepType})\n\n\tassert.Equal(true, rootLeft, \"rootLeft should be true\")\n\tassert.Equal(true, rootRight, \"rootRight should be true\")\n\tassert.Equal(true, NSubjLeft, \"NSubjLeft should be true\")\n\tassert.Equal(true, NSubjRight, \"NSubjRight should be true\")\n\tassert.Equal(true, ShiftDep, \"ShiftDep should be true\")\n\n\tlogf(\"rootRight: %v, 
rootLeft: %v\", rootLeft, rootRight)\n\tlogf(\"NSubjRight: %v, NSubjLeft: %v\", NSubjRight, NSubjLeft)\n\tlogf(\"ShiftDep: %v\", ShiftDep)\n}\n\nfunc TestOracle(t *testing.T) {\n\tst := simpleSentence()[0]\n\ts := st.AnnotatedSentence(nil)\n\tc := newConfiguration(s, true)\n\td := s.Dependency()\n\n\tfor count := 0; !c.isTerminal() && count < 100; count++ {\n\t\toracle := c.oracle(d)\n\n\t\tif !c.canApply(oracle) && (oracle != transition{Right, lingo.Root}) {\n\t\t\tt.Errorf(\"Cannot apply %v\", oracle)\n\t\t\tbreak\n\t\t}\n\n\t\tc.apply(oracle)\n\t}\n\n\tassert.Equal(t, d.Heads(), c.Heads())\n}\n"
  },
  {
    "path": "dep/configuration.go",
    "content": "package dep\n\nimport (\n\t\"fmt\"\n\n\t\"github.com/chewxy/lingo\"\n)\n\n// describes the current state of the parser\n\ntype head int\n\nconst (\n\tDOES_NOT_EXIST head = iota - 1\n)\n\n// configuration is the meat of the shift-reduce parsing. It holds the state for the shift reduction\ntype configuration struct {\n\t*lingo.Dependency\n\tstack  []head\n\tbuffer []head\n\n\tbp int // buffer pointer - starts at 0, increments\n}\n\nfunc newConfiguration(sentence lingo.AnnotatedSentence, fromGold bool) *configuration {\n\tif fromGold {\n\t\tsentence = sentence.Clone()\n\t}\n\n\tdep := lingo.NewDependency(lingo.FromAnnotatedSentence(sentence), lingo.AllocTree())\n\tdep.SetID()\n\tsentence = sentence[1:] // because the POSTagger automatically adds a ROOTTAG at the end of it\n\n\tvar buffer []head\n\tfor i := 1; i <= len(sentence); i++ {\n\t\tbuffer = append(buffer, head(i))\n\t}\n\n\tvar stack []head\n\tstack = append(stack, head(0)) // add root\n\n\treturn &configuration{\n\t\tDependency: dep,\n\t\tstack:      stack,\n\t\tbuffer:     buffer,\n\t}\n}\n\nfunc (c *configuration) String() string {\n\treturn fmt.Sprintf(\"Stack: %v Buffer(%d): %v\", c.stack, c.bp, c.buffer[c.bp:])\n}\n\nfunc (c *configuration) GoString() string {\n\treturn fmt.Sprintf(\"Stack: %v Buffer(%d): %v\\nHeads: %v\\nRels: %v\\n\", c.stack, c.bp, c.buffer[c.bp:], c.Heads(), c.Labels())\n}\n\nfunc (c *configuration) bufferSize() int {\n\treturn len(c.buffer) - c.bp\n}\n\nfunc (c *configuration) stackSize() int {\n\treturn len(c.stack)\n}\n\nfunc (c *configuration) head(i int) head {\n\theads := c.Heads() // TODO: maybe some sanity checks?\n\treturn head(heads[i])\n}\n\n// gets the sentence index of the ith word on the stack. 
If there isn't anything on the stack, it returns DOES_NOT_EXIST\nfunc (c *configuration) stackValue(i int) head {\n\tsize := c.stackSize()\n\tif i >= size || i < 0 {\n\t\treturn DOES_NOT_EXIST\n\t}\n\treturn c.stack[size-1-i]\n}\n\nfunc (c *configuration) bufferValue(i int) head {\n\tsize := c.bufferSize()\n\tif i >= size {\n\t\treturn DOES_NOT_EXIST\n\t}\n\treturn c.buffer[i+c.bp]\n}\n\n/*  stack machinations */\n\n// pop pops the stack. It isn't really used any more. removeStack(), removeTopStack() and removeSecondTopStack() has superseded its function\nfunc (c *configuration) pop() head {\n\tretVal := c.stack[len(c.stack)-1]\n\tc.stack = c.stack[0 : len(c.stack)-1]\n\treturn retVal\n}\n\n// removes a value from the stack.\nfunc (c *configuration) removeStack(i int) {\n\tc.stack = c.stack[:i+copy(c.stack[i:], c.stack[i+1:])]\n}\n\n// removeSecondTopStack removes the 2nd-to-last element\nfunc (c *configuration) removeSecondTopStack() bool {\n\tstackSize := c.stackSize()\n\tif stackSize < 2 {\n\t\treturn false\n\t}\n\ti := stackSize - 2\n\tc.removeStack(i)\n\treturn true\n}\n\nfunc (c *configuration) removeTopStack() bool {\n\tstackSize := c.stackSize()\n\tif stackSize < 1 {\n\t\treturn false\n\t}\n\ti := stackSize - 1\n\tc.removeStack(i)\n\treturn true\n}\n\n/* Dependency related stuff */\n\nfunc (c *configuration) label(i head) lingo.DependencyType {\n\tif i < 0 {\n\t\treturn lingo.NoDepType\n\t}\n\n\tif i == 0 {\n\t\treturn lingo.NoDepType\n\t}\n\n\treturn c.Label(int(i))\n\t// i--\n\n\t// labels := c.Labels()\n\t// return labels[i]\n}\n\nfunc (c *configuration) annotation(i head) *lingo.Annotation {\n\tif i < 0 {\n\t\treturn lingo.NullAnnotation()\n\t}\n\n\tif i == 0 {\n\t\treturn lingo.RootAnnotation()\n\t}\n\t// i--\n\n\treturn c.Annotation(int(i))\n\n\t// return c.Sentence()[i]\n}\n\n// gets the jth left child of the ith word of a sentence\nfunc (c *configuration) lc(k, cnt head) head {\n\tif k < 0 || int(k) > c.N() {\n\t\treturn DOES_NOT_EXIST\n\t}\n\n\tcc 
:= 0\n\tfor i := 1; i < int(k); i++ {\n\t\tif c.Head(i) == int(k) {\n\t\t\tcc++\n\t\t\tif int(cnt) == cc {\n\t\t\t\treturn head(i)\n\t\t\t}\n\t\t}\n\t}\n\treturn DOES_NOT_EXIST\n}\n\nfunc (c *configuration) rc(k, cnt head) head {\n\tif k < 0 || int(k) > c.N() {\n\t\treturn DOES_NOT_EXIST\n\t}\n\n\tcc := 0\n\tfor i := c.N(); i > int(k); i-- {\n\t\tif c.Head(i) == int(k) {\n\t\t\tcc++\n\t\t\tif cc == int(cnt) {\n\t\t\t\treturn head(i)\n\t\t\t}\n\t\t}\n\t}\n\treturn DOES_NOT_EXIST\n}\n\nfunc (c *configuration) hasOtherChildren(i int, goldParse *lingo.Dependency) bool {\n\tfor j := 1; j <= goldParse.N(); j++ {\n\t\tif goldParse.Head(j) == i && c.Head(j) != i {\n\t\t\treturn true\n\t\t}\n\t}\n\treturn false\n}\n\nfunc (c *configuration) isTerminal() bool {\n\treturn c.stackSize() == 1 && c.bufferSize() == 0\n}\n\n// Actual Transitioning stuff\nfunc (c *configuration) shift() bool {\n\ti := c.bufferValue(0)\n\tif i == DOES_NOT_EXIST {\n\t\treturn false\n\t}\n\n\tc.bp++ // move the buffer pointer up\n\n\tc.stack = append(c.stack, i) // push to it.... gotta work the pop\n\treturn true\n}\n"
  },
  {
    "path": "dep/configuration_test.go",
    "content": "package dep\n\nimport (\n\t\"testing\"\n\n\t\"github.com/chewxy/lingo\"\n\t\"github.com/stretchr/testify/assert\"\n)\n\nfunc TestStackAppendRemove(t *testing.T) {\n\tsentence := mediumSentence()[0]\n\tas := sentence.AnnotatedSentence(dummyFix{})\n\n\tc := newConfiguration(as, true)\n\tt.Logf(\"C: %v\", c)\n\tt.Logf(\"C: %#v\", c)\n\n\tassert := assert.New(t)\n\n\tc.stack = append(c.stack, 200)\n\tassert.Equal([]head{0, 200}, c.stack, \"stack is not equal after appending\")\n\n\tc.removeTopStack()\n\tassert.Equal([]head{0}, c.stack, \"stack is not equal after removeTopStack\")\n\n\tc.stack = append(c.stack, 200)\n\tc.removeSecondTopStack()\n\tassert.Equal([]head{200}, c.stack, \"stack is not equal after removeSecondTopStack()\")\n\n\tcorrectHeads := []int{-1} // the -1 is the root\n\tcorrectHeads = append(correctHeads, sentence.Heads...)\n\tcorrectLabels := []lingo.DependencyType{lingo.Root}\n\tcorrectLabels = append(correctLabels, sentence.Labels...)\n\n\tdep := sentence.Dependency(dummyFix{})\n\tassert.Equal(correctHeads, dep.Heads(), \"Heads are not equal\")\n\tassert.Equal(correctLabels, dep.Labels(), \"Labels are not equal %v \\n %v\", correctLabels, dep.Labels())\n}\n\nfunc TestConfiguration_StackValue(t *testing.T) {\n\tc := new(configuration)\n\tc.stack = []head{0, 1, 2, 5, 6}\n\n\tzero := c.stackValue(0)\n\tone := c.stackValue(1)\n\tfour := c.stackValue(4)\n\tfive := c.stackValue(5)\n\tnegone := c.stackValue(-1)\n\n\tassert := assert.New(t)\n\tassert.Equal(head(6), zero, \"Zeroth value not the same\")\n\tassert.Equal(head(5), one, \"First value not the same\")\n\tassert.Equal(head(0), four, \"Fourth value not the same\")\n\tassert.Equal(DOES_NOT_EXIST, five, \"Fifth value not the same\")\n\tassert.Equal(DOES_NOT_EXIST, negone, \"NegOne value not the same\")\n\n}\n"
  },
  {
    "path": "dep/debug.go",
    "content": "// +build debug\n\npackage dep\n\nimport (\n\t\"bytes\"\n\t\"fmt\"\n\t\"log\"\n\t\"runtime\"\n\t\"strings\"\n\t\"sync/atomic\"\n\n\t\"github.com/chewxy/lingo\"\n)\n\nconst BUILD_DEBUG = \"PARSER: DEBUG BUILD\"\nconst BUILD_DIAG = \"Diagnostic Build\"\n\nconst DEBUG = true\n\nvar READMEMSTATS = true\n\nvar TABCOUNT uint32 = 0\n\nfunc tabcount() int {\n\treturn int(atomic.LoadUint32(&TABCOUNT))\n}\n\nfunc enterLoggingContext() {\n\tatomic.AddUint32(&TABCOUNT, 1)\n\ttc := tabcount()\n\tlog.SetPrefix(strings.Repeat(\"\\t\", tc))\n}\n\nfunc leaveLoggingContext() {\n\ttc := tabcount()\n\ttc--\n\n\tif tc < 0 {\n\t\tatomic.StoreUint32(&TABCOUNT, 0)\n\t\ttc = 0\n\t} else {\n\t\tatomic.StoreUint32(&TABCOUNT, uint32(tc))\n\t}\n\tlog.SetPrefix(strings.Repeat(\"\\t\", tc))\n}\n\nfunc logf(format string, others ...interface{}) {\n\tif !DEBUG {\n\t\treturn\n\t}\n\tlog.Printf(format, others...)\n}\n\nfunc logTrainingProgress(iteration, correct, total, length, possibles int) {\n\tif !DEBUG {\n\t\treturn\n\t}\n\n\tlog.Printf(\"Iteration %d. 
Correct/Total: %d/%d = %.2f\", iteration, correct, total, float64(correct)/float64(total))\n\tlog.Printf(\"DictSize: %d/%d, load factor of: %.2f\", length, possibles, float64(length)/float64(possibles))\n}\n\nfunc logMemStats() {\n\tif !DEBUG || !READMEMSTATS {\n\t\treturn\n\t}\n\n\tvar mem runtime.MemStats\n\truntime.ReadMemStats(&mem)\n\n\tlog.Printf(\"Allocated          : %.2f MB\", (float64(mem.Alloc)/1024)/float64(1024))\n\tlog.Printf(\"Total Allocated    : %.2f MB\", (float64(mem.TotalAlloc)/1024)/float64(1024))\n\tlog.Printf(\"Heap Allocted      : %.2f MB\", (float64(mem.HeapAlloc)/1024)/float64(1024))\n\tlog.Printf(\"Sys Total Allocated: %.2f MB\", (float64(mem.HeapSys)/1024)/float64(1024))\n\tlog.Println(\"----------\")\n}\n\nfunc recoverFrom(format string, attrs ...interface{}) {\n\tif r := recover(); r != nil {\n\t\tlog.Printf(format, attrs...)\n\t\tpanic(r)\n\t}\n}\n\n/* Nice output of shit */\nfunc (d *Parser) SprintFeatures(features []int) string {\n\t// tabcount := int(atomic.LoadUint32(&TABCOUNT))\n\n\tvar buf bytes.Buffer\n\n\tfor i := 0; i < 18; i++ {\n\t\tnumber := features[i]\n\t\tid := number - wordFeatsStartAt\n\t\tword, _ := d.corpus.Word(id)\n\n\t\tif word == \"\" {\n\t\t\tword = \"-NULL-\"\n\t\t}\n\n\t\tbuf.WriteString(fmt.Sprintf(\"%d, %q, %d \\n\", feature(i), word, number))\n\t}\n\n\tfor i := 0; i < 18; i++ {\n\t\tnumber := features[i+18]\n\n\t\tbuf.WriteString(fmt.Sprintf(\"%d, %v, %d\\n\", feature(i+18), lingo.POSTag(number), number))\n\t}\n\n\tfor i := 0; i < 12; i++ {\n\t\tnumber := features[i+36]\n\t\tid := number - labelFeatsStartAt\n\n\t\tbuf.WriteString(fmt.Sprintf(\"%d, %v, %d\\n\", feature(i+36), lingo.DependencyType(id), number))\n\t}\n\n\treturn buf.String()\n}\n\nfunc SprintScores(scores []float64, ts []transition) string {\n\tvar buf bytes.Buffer\n\tfor i, v := range scores {\n\t\tif i >= len(ts) {\n\t\t\tbuf.WriteString(fmt.Sprintf(\"UNKNOWN TRANSITION, %v\\n\", 
v))\n\t\t\tcontinue\n\t\t}\n\t\tbuf.WriteString(fmt.Sprintf(\"%v, %v\\n\", ts[i], v))\n\t}\n\treturn buf.String()\n}\n\nfunc SprintFloatSlice(a []float64) string {\n\tvar buf bytes.Buffer\n\tbuf.WriteString(\"[\")\n\tfor i, v := range a {\n\t\tif i < len(a)-1 {\n\t\t\tbuf.WriteString(fmt.Sprintf(\"%v, \", v))\n\t\t} else {\n\t\t\tbuf.WriteString(fmt.Sprintf(\"%v\", v))\n\t\t}\n\t}\n\tbuf.WriteString(\"]\")\n\treturn buf.String()\n}\n"
  },
  {
    "path": "dep/dependencyParser.go",
    "content": "package dep\n\nimport (\n\t\"fmt\"\n\n\t\"github.com/chewxy/lingo\"\n\t\"github.com/chewxy/lingo/corpus\"\n\t\"github.com/pkg/errors\"\n)\n\nvar KnownWords *corpus.Corpus // package provided global\n\n// Parser is the object that performs the dependency parsing\n// It contains a neural network, which is the core of it.\n//\n// The same object can be used to train the NN\ntype Parser struct {\n\tInput  chan lingo.AnnotatedSentence\n\tOutput chan *lingo.Dependency\n\tError  chan error\n\n\t*Model\n}\n\n// New creates a new Parser\nfunc New(m *Model) *Parser {\n\td := &Parser{\n\t\tOutput: make(chan *lingo.Dependency),\n\t\tError:  make(chan error),\n\n\t\tModel: m,\n\t}\n\n\treturn d\n}\n\n// Run is used when using the NN to parse a sentence. For training, see Train()\nfunc (d *Parser) Run() {\n\tdefer close(d.Output)\n\tfor sentence := range d.Input {\n\t\tdep, err := d.predict(sentence)\n\n\t\tif err != nil {\n\t\t\td.Error <- err\n\t\t\treturn\n\t\t}\n\t\td.Output <- dep\n\t}\n\treturn\n}\n\nfunc (d *Parser) predict(sentence lingo.AnnotatedSentence) (*lingo.Dependency, error) {\n\t// defer func() {\n\t// \tif r := recover(); r != nil {\n\t// \t\tlog.Printf(\"Parsing for %q\", sentence.ValueString())\n\t// \t\tpanic(r)\n\t// \t}\n\t// }()\n\tc := newConfiguration(sentence, false)\n\n\tvar err error\n\tvar argmax int\n\tvar count int\n\tfor !c.isTerminal() && count < 100 {\n\t\tlogf(\"%v\", c)\n\t\tif count == 99 {\n\t\t\tlogf(\"TARPIT\")\n\t\t}\n\n\t\tfeatures := getFeatures(c, d.corpus)\n\t\t// features2 := getFeatureArray(c, d.dict)\n\n\t\tif argmax, err = d.nn.pred(features); err != nil {\n\t\t\treturn nil, err\n\t\t}\n\t\t// log.Printf(\"Argmax: %v, len(d.ts): %v, len(transitions) %v\", argmax, len(d.ts), len(transitions))\n\t\tt := transitions[argmax] // no this is NOT a mistake\n\t\tif !c.canApply(t) {\n\t\t\tt = transition{Shift, lingo.NoDepType} // reset\n\t\t\t// manual argmaxing\n\t\t\tswitch scores := d.nn.scores.Value().Data().(type) 
{\n\t\t\tcase []float32:\n\t\t\t\tvar maxScore float32\n\t\t\t\tfor i, kt := range d.ts {\n\t\t\t\t\tif scores[i] > maxScore && c.canApply(kt) {\n\t\t\t\t\t\tmaxScore = scores[i]\n\t\t\t\t\t\tt = kt\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\tcase []float64:\n\t\t\t\tvar maxScore float64\n\t\t\t\tfor i, kt := range d.ts {\n\t\t\t\t\tif scores[i] > maxScore && c.canApply(kt) {\n\t\t\t\t\t\tmaxScore = scores[i]\n\t\t\t\t\t\tt = kt\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\tdefault:\n\t\t\t\treturn nil, errors.Errorf(\"Unhandled score type %T\", d.nn.scores.Value())\n\t\t\t}\n\n\t\t}\n\t\tc.apply(t)\n\n\t\tcount++\n\t}\n\tfix(c.Dependency)\n\treturn c.Dependency, err\n}\n\nfunc (d *Parser) String() string {\n\tvar nns, ds string\n\n\tif d.corpus != nil {\n\t\tds = fmt.Sprintf(\"\\nDict Size: %d words\\nMAXTAG: %d\\nMAXDEPTYPE: %d\\n\", d.corpus.Size(), lingo.MAXTAG, lingo.MAXDEPTYPE)\n\t} else {\n\t\tds = \"\\n\"\n\t}\n\n\tif d.nn != nil && d.nn.initialized() {\n\t\tnns = fmt.Sprintf(\"\\nNeural Network:\\n=================\\n%v\\n\", d.nn)\n\t}\n\n\tif !d.nn.initialized() {\n\t\tpanic(fmt.Sprintf(\"%v\", d.nn))\n\t}\n\n\tbase := \"\\n\\nDependency Parser Info:\\n=======================\\n\"\n\treturn base + ds + nns\n}\n"
  },
  {
    "path": "dep/documentation/iamhuman.dot",
    "content": "digraph G {\n\tNode_0xc425b88740->Node_0xc425b88780[ label=Root ];\n\tNode_0xc425b88780->Node_0xc425b88800[ label=Cop ];\n\tNode_0xc425b88780->Node_0xc425b887c0[ label=NSubj ];\n\tNode_0xc425b88740 [ label=\"0: &#34;-ROOT-/ROOT_TAG&#34;\" ];\n\tNode_0xc425b88780 [ label=\"3: &#34;human/JJ&#34;\" ];\n\tNode_0xc425b887c0 [ label=\"1: &#34;I/PRP&#34;\" ];\n\tNode_0xc425b88800 [ label=\"2: &#34;am/VBP&#34;\" ];\n\n}"
  },
  {
    "path": "dep/documentation/thecatsatonthemat.dot",
    "content": "digraph G {\n\tNode_0xc4349eeec0->Node_0xc4349eef80[ label=Root ];\n\tNode_0xc4349eef80->Node_0xc4349eefc0[ label=NMod ];\n\tNode_0xc4349eefc0->Node_0xc4349ef040[ label=Det ];\n\tNode_0xc4349eef80->Node_0xc4349eef00[ label=NSubj ];\n\tNode_0xc4349eef00->Node_0xc4349eef40[ label=Det ];\n\tNode_0xc4349eefc0->Node_0xc4349ef000[ label=Case ];\n\tNode_0xc4349eeec0 [ label=\"0: &#34;-ROOT-/ROOT_TAG&#34;\" ];\n\tNode_0xc4349eef00 [ label=\"2: &#34;cat/NN&#34;\" ];\n\tNode_0xc4349eef40 [ label=\"1: &#34;the/DT&#34;\" ];\n\tNode_0xc4349eef80 [ label=\"3: &#34;sat/VBD&#34;\" ];\n\tNode_0xc4349eefc0 [ label=\"6: &#34;mat/NN&#34;\" ];\n\tNode_0xc4349ef000 [ label=\"4: &#34;on/IN&#34;\" ];\n\tNode_0xc4349ef040 [ label=\"5: &#34;the/DT&#34;\" ];\n\n}\n\n"
  },
  {
    "path": "dep/errors.go",
    "content": "package dep\n\nimport (\n\t\"fmt\"\n\n\t\"github.com/chewxy/lingo\"\n)\n\ntype componentUnavailable string\n\nfunc (c componentUnavailable) Error() string     { return fmt.Sprintf(\"%v unavailable\", c) }\nfunc (c componentUnavailable) Component() string { return string(c) }\n\n// TarpitError is an error when the arc-standard is stuck.\n// It implements GoStringer, which when called will output the state as a string.\n// It also implements lingo.Sentencer, so the offending sentence can easily be retrieved\ntype TarpitError struct{ *configuration }\n\nfunc (err TarpitError) Error() string { return \"Tarpit Error\" }\n\n// NonProjective error is the error that is emitted when the dependency tree is not projective (that is to say the children cross lines)\ntype NonProjectiveError struct{ *lingo.Dependency }\n\nfunc (err NonProjectiveError) Error() string { return \"Non-projective tree\" }\n"
  },
  {
    "path": "dep/evaluation.go",
    "content": "package dep\n\nimport (\n\t\"fmt\"\n\t\"io/ioutil\"\n\n\t\"github.com/chewxy/lingo\"\n\t\"github.com/chewxy/lingo/treebank\"\n)\n\n// Performance is a tuple that holds performance information from a training session\ntype Performance struct {\n\tIter int     // which training iteration is this?\n\tUAS  float64 // Unlabelled Attachment Score\n\tLAS  float64 // Labeled Attachment Score\n\tUEM  float64 // Unlabelled Exact Match\n\tRoot float64 // Correct Roots Ratio\n}\n\nfunc (p Performance) String() string {\n\ts := `EPO: %d\nUAS: %.5f\nLAS: %.5f\nUEM: %.5f\nROO: %.5f`\n\n\treturn fmt.Sprintf(s, p.Iter, p.UAS, p.LAS, p.UEM, p.Root)\n}\n\n// performance evaluation related code goes here\n\n// Evaluate compares predicted trees with the gold standard trees and returns a Performance. It panics if the number of predicted trees and the number of gold trees aren't the same\nfunc Evaluate(predictedTrees, goldTrees []*lingo.Dependency) Performance {\n\tif len(predictedTrees) != len(goldTrees) {\n\t\tpanic(fmt.Sprintf(\"%d predicted trees; %d gold trees. 
Unable to compare\", len(predictedTrees), len(goldTrees)))\n\t}\n\n\tvar correctLabels, correctHeads, correctTrees, correctRoot, sumArcs float64\n\tvar check int\n\n\tfor i, tr := range predictedTrees {\n\t\tgTr := goldTrees[i]\n\n\t\tif len(tr.AnnotatedSentence) != len(gTr.AnnotatedSentence) {\n\t\t\tsumArcs += float64(gTr.N())\n\n\t\t\t// log.Printf(\"WARNING: %q and %q do not have the same length\", tr, gTr)\n\t\t\tcontinue\n\t\t}\n\n\t\tvar nCorrectHead int\n\t\tfor j, a := range tr.AnnotatedSentence[1:] {\n\t\t\tb := gTr.AnnotatedSentence[j+1]\n\t\t\tif a.HeadID() == b.HeadID() {\n\t\t\t\tcorrectHeads++\n\t\t\t\tnCorrectHead++\n\t\t\t}\n\n\t\t\tif a.DependencyType == b.DependencyType {\n\t\t\t\tcorrectLabels++\n\t\t\t}\n\t\t\tsumArcs++\n\t\t}\n\t\tif nCorrectHead == gTr.N() {\n\t\t\tcorrectTrees++\n\t\t}\n\t\tif tr.Root() == gTr.Root() {\n\t\t\tcorrectRoot++\n\t\t}\n\n\t\t// check 5 per iteration\n\t\tif check < 5 {\n\t\t\tlogf(\"predictedHeads: \\n%v\\n%v\\n\", tr.Heads(), gTr.Heads())\n\t\t\tlogf(\"Ns: %v | %v || Correct: %v\", tr.N(), gTr.N(), nCorrectHead)\n\t\t\tcheck++\n\t\t}\n\t}\n\n\tuas := correctHeads / sumArcs\n\tlas := correctLabels / sumArcs\n\tuem := correctTrees / float64(len(predictedTrees))\n\troo := correctRoot / float64(len(predictedTrees))\n\n\treturn Performance{UAS: uas, LAS: las, UEM: uem, Root: roo}\n}\n\nfunc (t *Trainer) crossValidate(st []treebank.SentenceTag) Performance {\n\tpreds := t.predMany(st)\n\tgolds := make([]*lingo.Dependency, len(st))\n\n\tfor i, s := range st {\n\t\tgolds[i] = s.Dependency(t)\n\t}\n\treturn Evaluate(preds, golds)\n}\n\nfunc (t *Trainer) predMany(sentenceTags []treebank.SentenceTag) []*lingo.Dependency {\n\tretVal := make([]*lingo.Dependency, len(sentenceTags))\n\tfor i, st := range sentenceTags {\n\t\tdep, err := t.pred(st.AnnotatedSentence(t))\n\t\tif err != nil {\n\t\t\tioutil.WriteFile(\"fullGraph.dot\", []byte(t.nn.g.ToDot()), 0644)\n\t\t\tpanic(fmt.Sprintf(\"%+v\", err))\n\t\t}\n\t\tretVal[i] = 
dep\n\t}\n\treturn retVal\n}\n\nfunc (t *Trainer) pred(as lingo.AnnotatedSentence) (*lingo.Dependency, error) {\n\td := new(Parser)\n\td.Model = t.Model\n\n\treturn d.predict(as)\n}\n"
  },
  {
    "path": "dep/example.go",
    "content": "package dep\n\nimport (\n\t\"math/rand\"\n\n\t\"github.com/chewxy/lingo\"\n\t\"github.com/chewxy/lingo/corpus\"\n\t\"github.com/chewxy/lingo/treebank\"\n)\n\n// example is a training example.\ntype example struct {\n\ttransition\n\n\tfeatures []int // features are used in the embeddings\n\tlabels   []int // labels are used in scoring the transitions\n}\n\nfunc makeExamples(sentenceTags []treebank.SentenceTag, conf NNConfig, dict *corpus.Corpus, ts []transition, f lingo.AnnotationFixer) []example {\n\tvar examples []example\n\n\tvar tarpit, nonprojective, good int\n\tfor i, sentenceTag := range sentenceTags {\n\t\texs, err := makeOneExample(i, sentenceTag, dict, ts, f)\n\t\tif err != nil {\n\t\t\tswitch err.(type) {\n\t\t\tcase TarpitError:\n\t\t\t\ttarpit++\n\t\t\tcase NonProjectiveError:\n\t\t\t\tnonprojective++\n\t\t\t}\n\t\t} else {\n\t\t\texamples = append(examples, exs...)\n\t\t\tgood++\n\t\t}\n\t}\n\n\tlogf(\"Number of SentenceTags Generated Into Examples: %d/%d | Number of Examples: %d | Number of nonprojective examples: %d | Number of tarpit examples: %d\", good, len(sentenceTags), len(examples), nonprojective, tarpit)\n\treturn examples\n}\n\n// makeOneExample is an example of a poorly named function. 
It makes an example from a SentenceTag\nfunc makeOneExample(i int, sentenceTag treebank.SentenceTag, dict *corpus.Corpus, ts []transition, f lingo.AnnotationFixer) ([]example, error) {\n\tvar examples []example\n\n\ts := sentenceTag.AnnotatedSentence(f)\n\tdep := s.Dependency()\n\tif dep.IsProjective() {\n\t\tc := newConfiguration(s, true)\n\n\t\tcount := 0\n\t\tfor !c.isTerminal() && count < 1000 {\n\t\t\tif count == 999 {\n\t\t\t\treturn examples, TarpitError{c}\n\t\t\t}\n\n\t\t\toracle := c.oracle(dep)\n\t\t\tfeatures := getFeatures(c, dict)\n\n\t\t\tlabels := make([]int, MAXTRANSITION)\n\t\t\tfor i, t := range ts {\n\t\t\t\tif t == oracle {\n\t\t\t\t\tlabels[i] = 1\n\t\t\t\t} else if c.canApply(t) {\n\t\t\t\t\tlabels[i] = 0\n\t\t\t\t} else {\n\t\t\t\t\tlabels[i] = -1\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tex := example{transition{oracle.Move, oracle.DependencyType}, features, labels}\n\t\t\texamples = append(examples, ex)\n\n\t\t\tc.apply(oracle)\n\t\t\tcount++\n\t\t}\n\t} else {\n\t\treturn nil, NonProjectiveError{dep}\n\t}\n\n\treturn examples, nil\n}\n\nfunc shuffleExamples(a []example) {\n\tfor i := range a {\n\t\tj := rand.Intn(i + 1)\n\t\ta[i], a[j] = a[j], a[i]\n\t}\n}\n"
  },
  {
    "path": "dep/example_test.go",
    "content": "package dep\n\nimport (\n\t\"testing\"\n\n\t\"github.com/chewxy/lingo/corpus\"\n)\n\nfunc TestMakeExamples(t *testing.T) {\n\tst := simpleSentence()\n\tdict := corpus.GenerateCorpus(st)\n\n\texs := makeExamples(st, DefaultNNConfig, dict, transitions, dummyFix{})\n\tif len(exs) != 20 {\n\t\tt.Error(\"Expected 20 examples to be generated from simple sentence\")\n\t}\n}\n"
  },
  {
    "path": "dep/featureExtraction.go",
    "content": "package dep\n\nimport (\n\t\"github.com/chewxy/lingo\"\n\t\"github.com/chewxy/lingo/corpus\"\n)\n\n// getFeatures extracts the IDs to pass into the neural network. These IDs are used in the network to construct the  input layers\nfunc getFeatures(c *configuration, dict *corpus.Corpus) []int {\n\t// logf(\"CONFIG: %v\", c)\n\twordFeats := make([]int, 0)\n\tposFeats := make([]lingo.POSTag, 0)\n\tlabelFeats := make([]lingo.DependencyType, 0)\n\tunknownID, _ := dict.Id(\"-UNKNOWN-\")\n\n\tfor j := 2; j >= 0; j-- {\n\t\tindex := c.stackValue(j)\n\t\tmor := c.annotation(index)\n\n\t\tif wordID, ok := dict.Id(mor.Value); ok {\n\t\t\twordFeats = append(wordFeats, wordID)\n\t\t} else {\n\t\t\twordFeats = append(wordFeats, unknownID)\n\t\t}\n\t\tposFeats = append(posFeats, mor.POSTag)\n\t}\n\n\t// logf(\"wordFeats: %v\", wordFeats)\n\n\tfor j := 0; j <= 2; j++ {\n\t\tindex := c.bufferValue(j)\n\t\tmor := c.annotation(index)\n\t\t// logf(\"Want: %v Index: %d. Morpheme: %v\", j, index, mor)\n\n\t\tif wordID, ok := dict.Id(mor.Value); ok {\n\t\t\twordFeats = append(wordFeats, wordID)\n\t\t} else {\n\t\t\twordFeats = append(wordFeats, unknownID)\n\t\t}\n\t\tposFeats = append(posFeats, mor.POSTag)\n\t}\n\t// logf(\"wordFeats: %v\", wordFeats)\n\n\tfor j := 0; j <= 1; j++ {\n\t\tk := c.stackValue(j)\n\n\t\tindex := c.lc(k, 1)\n\t\tmor := c.annotation(index)\n\t\tif wordID, ok := dict.Id(mor.Value); ok {\n\t\t\twordFeats = append(wordFeats, wordID)\n\t\t} else {\n\t\t\twordFeats = append(wordFeats, unknownID)\n\t\t}\n\t\tposFeats = append(posFeats, mor.POSTag)\n\t\tlabelFeats = append(labelFeats, c.label(index))\n\n\t\tindex = c.rc(k, 1)\n\t\tmor = c.annotation(index)\n\t\tif wordID, ok := dict.Id(mor.Value); ok {\n\t\t\twordFeats = append(wordFeats, wordID)\n\t\t} else {\n\t\t\twordFeats = append(wordFeats, unknownID)\n\t\t}\n\t\tposFeats = append(posFeats, mor.POSTag)\n\t\tlabelFeats = append(labelFeats, c.label(index))\n\n\t\tindex = c.lc(k, 2)\n\t\tmor = 
c.annotation(index)\n\t\tif wordID, ok := dict.Id(mor.Value); ok {\n\t\t\twordFeats = append(wordFeats, wordID)\n\t\t} else {\n\t\t\twordFeats = append(wordFeats, unknownID)\n\t\t}\n\t\tposFeats = append(posFeats, mor.POSTag)\n\t\tlabelFeats = append(labelFeats, c.label(index))\n\n\t\tindex = c.rc(k, 2)\n\t\tmor = c.annotation(index)\n\t\tif wordID, ok := dict.Id(mor.Value); ok {\n\t\t\twordFeats = append(wordFeats, wordID)\n\t\t} else {\n\t\t\twordFeats = append(wordFeats, unknownID)\n\t\t}\n\t\tposFeats = append(posFeats, mor.POSTag)\n\t\tlabelFeats = append(labelFeats, c.label(index))\n\n\t\tleftChild := c.lc(k, 1)\n\t\tindex = c.lc(leftChild, 1)\n\t\tmor = c.annotation(index)\n\t\tif wordID, ok := dict.Id(mor.Value); ok {\n\t\t\twordFeats = append(wordFeats, wordID)\n\t\t} else {\n\t\t\twordFeats = append(wordFeats, unknownID)\n\t\t}\n\t\tposFeats = append(posFeats, mor.POSTag)\n\t\tlabelFeats = append(labelFeats, c.label(index))\n\n\t\trightChild := c.rc(k, 1)\n\t\tindex = c.rc(rightChild, 1)\n\t\tmor = c.annotation(index)\n\t\tif wordID, ok := dict.Id(mor.Value); ok {\n\t\t\twordFeats = append(wordFeats, wordID)\n\t\t} else {\n\t\t\twordFeats = append(wordFeats, unknownID)\n\t\t}\n\t\tposFeats = append(posFeats, mor.POSTag)\n\t\tlabelFeats = append(labelFeats, c.label(index))\n\t}\n\n\t// the embedding matrix is arranged thus:\n\t/*\n\t\tPOSTag0 0, 1, ... 50\n\t\tPOSTag1\n\t\t...\n\t\tMAXTAG-1\n\t\tDepType0\n\t\tDepType1\n\t\t...\n\t\tMAXDEPTYPE-1\n\t\tWordID0\n\t\t...\n\t\tWordIDN\n\t*/\n\n\tfeatures := make([]int, MAXFEATURE)\n\n\tfor i, w := range wordFeats {\n\t\tfeatures[i] = w + wordFeatsStartAt\n\t}\n\tfor i, t := range posFeats {\n\t\tfeatures[i+POS_OFFSET] = int(t)\n\t}\n\tfor i, l := range labelFeats {\n\t\tfeatures[i+DEP_OFFSET] = int(l) + labelFeatsStartAt\n\t}\n\n\treturn features\n}\n\nconst (\n\tPOS_OFFSET   int = 18\n\tDEP_OFFSET       = 36\n\tSTACK_OFFSET     = 6\n\tSTACK_NUMBER     = 6\n)\n"
  },
  {
    "path": "dep/features.go",
    "content": "package dep\n\nimport \"github.com/chewxy/lingo\"\n\n// the features are used as columns in the matrix\n\n// go:generate stringer type=feature -output=feature_string.go\ntype feature int\n\nconst (\n\t// first 18 are word related features\n\t// second 18 are POS related features\n\t// last 12 are label related features\n\n\ts0w feature = iota\n\ts1w\n\ts2w\n\n\tb0w\n\tb1w\n\tb2w\n\n\ts0l1w\n\ts0r1w\n\ts0l2w\n\ts0r2w\n\ts0llw\n\ts0rrw\n\n\ts1l1w\n\ts1r1w\n\ts1l2w\n\ts1r2w\n\ts1llw\n\ts1rrw\n\n\t// POS related words\n\ts0t\n\ts1t\n\ts2t\n\n\tb0t\n\tb1t\n\tb2t\n\n\ts0l1t\n\ts0r1t\n\ts0l2t\n\ts0r2t\n\ts0llt\n\ts0rrt\n\n\ts1l1t\n\ts1r1t\n\ts1l2t\n\ts1r2t\n\ts1llt\n\ts1rrt\n\n\t// label related\n\ts0l1d\n\ts0r1d\n\ts0l2d\n\ts0r2d\n\ts0lld\n\ts0rrd\n\n\ts1l1d\n\ts1r1d\n\ts1l2d\n\ts1r2d\n\ts1lld\n\ts1rrd\n\n\tMAXFEATURE\n)\n\nconst (\n\twordFeatsStartAt  int = int(lingo.MAXTAG) + int(lingo.MAXDEPTYPE)\n\tlabelFeatsStartAt     = int(lingo.MAXTAG)\n\tposFeatsStartAt       = 0\n)\n"
  },
  {
    "path": "dep/features_string.go",
    "content": "// generated by stringer -type=feature -output=features_string.go; DO NOT EDIT\n\npackage dep\n\nimport \"fmt\"\n\nconst _feature_name = \"s0ws1ws2wb0wb1wb2ws0l1ws0r1ws0l2ws0r2ws0llws0rrws1l1ws1r1ws1l2ws1r2ws1llws1rrws0ts1ts2tb0tb1tb2ts0l1ts0r1ts0l2ts0r2ts0llts0rrts1l1ts1r1ts1l2ts1r2ts1llts1rrts0l1ds0r1ds0l2ds0r2ds0llds0rrds1l1ds1r1ds1l2ds1r2ds1llds1rrdMAXFEATURE\"\n\nvar _feature_index = [...]uint8{0, 3, 6, 9, 12, 15, 18, 23, 28, 33, 38, 43, 48, 53, 58, 63, 68, 73, 78, 81, 84, 87, 90, 93, 96, 101, 106, 111, 116, 121, 126, 131, 136, 141, 146, 151, 156, 161, 166, 171, 176, 181, 186, 191, 196, 201, 206, 211, 216, 226}\n\nfunc (i feature) String() string {\n\tif i < 0 || i >= feature(len(_feature_index)-1) {\n\t\treturn fmt.Sprintf(\"feature(%d)\", i)\n\t}\n\treturn _feature_name[_feature_index[i]:_feature_index[i+1]]\n}\n"
  },
  {
    "path": "dep/fix.go",
    "content": "package dep\n\nimport (\n\t\"log\"\n\n\t\"github.com/chewxy/lingo\"\n)\n\n// applies common fixes\nfunc fix(d *lingo.Dependency) {\n\t// NNP fix:\n\t// If a sentence is [a, b, c, D, E, f, g]\n\t// where D, E are NNPs, they should be compound words\n\t// The head should be the one with higher headID\n\tspans := properNounSpans(d)\n\tfor _, s := range spans {\n\t\t// we don't care about single word proper nouns\n\t\tif s.end-s.start <= 1 {\n\t\t\tcontinue\n\t\t}\n\n\t\tphrase := d.AnnotatedSentence[s.start:s.end]\n\n\t\t// pick up all compound roots\n\t\t// find annotations that do not have compound as deptype\n\t\tvar compoundRoots lingo.AnnotationSet\n\t\tvar problematic lingo.AnnotationSet\n\t\tfor _, a := range phrase {\n\t\t\tif lingo.IsCompound(a.DependencyType) {\n\t\t\t\tcompoundRoots = compoundRoots.Add(a.Head)\n\t\t\t}\n\n\t\t\tif !lingo.IsCompound(a.DependencyType) && a.ID != s.end-1 {\n\t\t\t\tproblematic = problematic.Add(a)\n\t\t\t}\n\t\t}\n\n\t\t// if no root\n\t\tif len(compoundRoots) == 0 {\n\t\t\t// actual root is the word with the largest ID\n\t\t\tvar compoundRoot *lingo.Annotation\n\t\t\tvar rootRoot *lingo.Annotation\n\t\t\tfor last := -1; s.end+last >= s.start; last-- {\n\t\t\t\tpredictedRoot := s.end + last\n\t\t\t\tcompoundRoot = d.AnnotatedSentence[predictedRoot]\n\n\t\t\t\t// incorrects :\n\t\t\t\t//\tdep==Dep\n\t\t\t\t// \tdep==Root && others has dep != root\n\n\t\t\t\tif compoundRoot.DependencyType == lingo.Dep {\n\t\t\t\t\tproblematic = problematic.Add(compoundRoot)\n\t\t\t\t\tcontinue\n\t\t\t\t}\n\n\t\t\t\tif compoundRoot.DependencyType != lingo.Dep && compoundRoot.DependencyType != lingo.Root {\n\t\t\t\t\tbreak\n\t\t\t\t}\n\n\t\t\t\tif compoundRoot.DependencyType == lingo.Root {\n\t\t\t\t\trootRoot = compoundRoot\n\t\t\t\t\tproblematic = problematic.Add(compoundRoot)\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tif rootRoot != nil && rootRoot != compoundRoot {\n\t\t\t\t// we have two potential roots. 
Choose the best\n\t\t\t\tlog.Println(\"Problem when fixing: more than one possible compound root found\")\n\t\t\t}\n\n\t\t\tfor _, a := range problematic {\n\t\t\t\tif a == compoundRoot {\n\t\t\t\t\tcontinue\n\t\t\t\t}\n\t\t\t\ttmpHead := a.Head\n\t\t\t\ttmpRel := a.DependencyType\n\n\t\t\t\ta.SetHead(compoundRoot)\n\t\t\t\ta.DependencyType = lingo.Compound\n\n\t\t\t\tfor _, childID := range d.AnnotatedSentence.Children(a.ID) {\n\t\t\t\t\tchildA := d.AnnotatedSentence[childID]\n\t\t\t\t\tchildA.SetHead(tmpHead)\n\t\t\t\t\tchildA.DependencyType = tmpRel\n\t\t\t\t}\n\t\t\t}\n\n\t\t}\n\n\t\t// if more than one root...\n\t\tlogf(\"More than zero compound roots not handled yet\")\n\n\t}\n\n\t// Number fix\n}\n\nfunc properNounSpans(d *lingo.Dependency) (retVal []span) {\n\tstart := -1\n\tend := -1\n\tfor i, a := range d.AnnotatedSentence {\n\t\tif lingo.IsProperNoun(a.POSTag) {\n\t\t\tif start == -1 {\n\t\t\t\tstart = i\n\t\t\t\tend = i + 1\n\t\t\t} else {\n\t\t\t\tend = i + 1\n\t\t\t}\n\t\t} else {\n\t\t\tif end == -1 {\n\t\t\t\tend = i\n\t\t\t}\n\n\t\t\tif start > -1 {\n\t\t\t\ts := makeSpan(start, end)\n\t\t\t\tretVal = append(retVal, s)\n\t\t\t}\n\n\t\t\tstart = -1\n\t\t\tend = -1\n\t\t}\n\t}\n\n\tif start > -1 {\n\t\ts := makeSpan(start, len(d.AnnotatedSentence))\n\t\tretVal = append(retVal, s)\n\t}\n\treturn\n}\n"
  },
  {
    "path": "dep/init.go",
    "content": "package dep\n\nimport \"github.com/chewxy/lingo/corpus\"\n\nfunc init() {\n\tc := corpus.New()\n\tc.Add(\"\") // add null words\n\n\tKnownWords = c\n}\n"
  },
  {
    "path": "dep/models.go",
    "content": "package dep\n\nimport (\n\t\"bufio\"\n\t\"bytes\"\n\t\"encoding/gob\"\n\t\"fmt\"\n\t\"io\"\n\t\"os\"\n\n\t\"github.com/chewxy/lingo/corpus\"\n\t\"github.com/pkg/errors\"\n\t\"gorgonia.org/tensor\"\n)\n\n// Model holds the neural network that a DependencyParser uses. To train, use a Trainer\ntype Model struct {\n\tnn     *neuralnetwork2\n\tcorpus *corpus.Corpus\n\tts     []transition\n}\n\nfunc (m *Model) Corpus() *corpus.Corpus { return m.corpus }\n\nfunc (m *Model) WordEmbeddings() *tensor.Dense {\n\tval := m.nn.e_w.Value().(*tensor.Dense)\n\temb := val.Clone().(*tensor.Dense)\n\treturn emb\n}\n\nfunc (m *Model) POSTagEmbeddings() *tensor.Dense {\n\tval := m.nn.e_t.Value().(*tensor.Dense)\n\temb := val.Clone().(*tensor.Dense)\n\treturn emb\n}\n\nfunc (m *Model) String() string {\n\tvar buf bytes.Buffer\n\tbuf.WriteString(m.nn.String())\n\tbuf.WriteString(\"Transitions: [\")\n\tfor _, t := range m.ts {\n\t\tfmt.Fprintf(&buf, \"%v, \", t)\n\t}\n\tbuf.WriteString(\"]\")\n\treturn buf.String()\n}\n\nfunc (m *Model) Save(filename string) error {\n\tif m.nn == nil {\n\t\treturn errors.Errorf(\"Cannot save a model with no nn\")\n\t}\n\n\tf, err := os.Create(filename)\n\tif err != nil {\n\t\treturn err\n\t}\n\treturn m.SaveWriter(f)\n}\n\nfunc (m *Model) SaveWriter(f io.WriteCloser) error {\n\tdefer f.Close()\n\tw := bufio.NewWriter(f)\n\tdefer w.Flush()\n\tencoder := gob.NewEncoder(w)\n\n\tif err := encoder.Encode(m.corpus); err != nil {\n\t\treturn err\n\t}\n\n\tif err := encoder.Encode(m.nn); err != nil {\n\t\treturn err\n\t}\n\n\t// if err := encoder.Encode(m.ts); err != nil {\n\t// \treturn err\n\t// }\n\n\treturn nil\n}\n\nfunc Load(filename string) (*Model, error) {\n\tf, err := os.Open(filename)\n\tif err != nil {\n\t\treturn nil, err\n\t}\n\treturn LoadReader(f)\n}\n\nfunc LoadReader(rd io.ReadCloser) (*Model, error) {\n\tdefer rd.Close()\n\tr := bufio.NewReader(rd)\n\tdecoder := gob.NewDecoder(r)\n\n\tm := new(Model)\n\tif err := 
decoder.Decode(&m.corpus); err != nil {\n\t\treturn nil, err\n\t}\n\n\tm.nn = new(neuralnetwork2)\n\tm.nn.dict = m.corpus\n\n\tif err := decoder.Decode(&m.nn); err != nil {\n\t\treturn nil, err\n\t}\n\n\tif err := decoder.Decode(&m.ts); err != nil {\n\t\tm.ts = transitions\n\t}\n\tm.nn.transitions = m.ts\n\n\treturn m, nil\n\n}\n"
  },
  {
    "path": "dep/models_test.go",
    "content": "package dep\n\nimport (\n\t\"os\"\n\t\"testing\"\n\n\t\"github.com/stretchr/testify/assert\"\n\tG \"gorgonia.org/gorgonia\"\n)\n\nfunc TestModel_SaveLoad(t *testing.T) {\n\tassert := assert.New(t)\n\n\ttestFileName := \"TestSave.dat\"\n\tm := new(Model)\n\n\t// dumb shit\n\tif err := m.Save(testFileName); err == nil {\n\t\tt.Error(\"Expected an error\")\n\t}\n\n\tconf := DefaultNNConfig\n\tconf.Dtype = G.Float32\n\tm = new(Model)\n\tm.ts = transitions\n\tm.corpus = KnownWords\n\n\tm.nn = new(neuralnetwork2)\n\tm.nn.NNConfig = conf\n\tm.nn.dict = m.corpus\n\n\tif err := m.nn.init(); err != nil {\n\t\tt.Error(err)\n\t}\n\n\tif err := m.Save(testFileName); err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tvar m2 *Model\n\tvar err error\n\tif m2, err = Load(testFileName); err != nil {\n\t\tt.Error(err)\n\n\t}\n\n\tassert.Equal(m.corpus, m2.corpus, \"Both Dependency Parsers need to have the same dict\")\n\n\tif !G.ValueEq(m.nn.w2.Value(), m2.nn.w2.Value()) {\n\t\tt.Errorf(\"Expected w2 to be equal\")\n\t}\n\tif !G.ValueEq(m.nn.e_w.Value(), m2.nn.e_w.Value()) {\n\t\tt.Errorf(\"Expected e_w to be equal\")\n\t}\n\n\t// cleanup\n\tif err := os.Remove(testFileName); err != nil {\n\t\tt.Error(err)\n\t}\n}\n"
  },
  {
    "path": "dep/move.go",
    "content": "package dep\n\n// Move is an action that the dependency parser can take - whether to Shift, Attach-Left, or AttachRight\ntype Move byte\n\n//go:generate stringer -type=Move\n\nconst (\n\tShift Move = iota\n\tLeft\n\tRight\n\n\tMAXMOVE\n)\n\n// ALLMOVES is the set of all possible moves\nvar ALLMOVES = [...]Move{Left, Right, Shift}\n"
  },
  {
    "path": "dep/move_string.go",
    "content": "// generated by stringer -type=Move; DO NOT EDIT\n\npackage dep\n\nimport \"fmt\"\n\nconst _Move_name = \"ShiftLeftRightMAXMOVE\"\n\nvar _Move_index = [...]uint8{0, 5, 9, 14, 21}\n\nfunc (i Move) String() string {\n\tif i >= Move(len(_Move_index)-1) {\n\t\treturn fmt.Sprintf(\"Move(%d)\", i)\n\t}\n\treturn _Move_name[_Move_index[i]:_Move_index[i+1]]\n}\n"
  },
  {
    "path": "dep/nn2.go",
    "content": "package dep\n\nimport (\n\t\"github.com/chewxy/lingo\"\n\t\"github.com/chewxy/lingo/corpus\"\n\t\"github.com/pkg/errors\"\n\tG \"gorgonia.org/gorgonia\"\n\t\"gorgonia.org/tensor\"\n)\n\n// may is a simple monad for handling errors\ntype may struct {\n\terror\n\tn *G.Node\n}\n\nfunc (m *may) doUnary(fn func(*G.Node) (*G.Node, error)) {\n\tif m.error != nil {\n\t\treturn\n\t}\n\tm.n, m.error = fn(m.n)\n}\n\nfunc (m *may) doBinary(fn func(a, b *G.Node) (*G.Node, error), other *G.Node) {\n\tif m.error != nil {\n\t\treturn\n\t}\n\tm.n, m.error = fn(m.n, other)\n}\n\nfunc (m *may) doSwapBinary(fn func(a, b *G.Node) (*G.Node, error), other *G.Node) {\n\tif m.error != nil {\n\t\treturn\n\t}\n\tm.n, m.error = fn(other, m.n)\n}\n\ntype neuralnetwork2 struct {\n\tNNConfig\n\n\tg   *G.ExprGraph\n\tsub *G.ExprGraph\n\n\t// model\n\n\t// embedding matrices for word, POSTags and labels respectively\n\te_w *G.Node // Shape: (EmbeddingSize, DictSize)\n\te_t *G.Node // Shape: (EmbeddingSize, lingo.MAXTAG)\n\te_l *G.Node // Shape: (EmbeddingSize, lingo.MAXDEP)\n\n\t// w1\n\tw1_w *G.Node // Shape: (HiddenSize, DictSize)\n\tw1_t *G.Node // Shape: (HiddenSize, lingo.MAXTAG)\n\tw1_l *G.Node // Shape: (HiddenSize, lingo.MAXDEP)\n\tb    *G.Node // Shape: (HiddenSize)\n\n\t// w2\n\tw2 *G.Node // Shape: (MAXTRANSITION, HiddenSize)\n\n\t// selects\n\tx_wSelW G.Nodes // 18 - word features\n\tx_tSelT G.Nodes // 18 - POSTag features\n\tx_lSelL G.Nodes // 12 - Dependency feature\n\n\t// inputs (feature vectors built up from the selects)\n\tx_w *G.Node\n\tx_t *G.Node\n\tx_l *G.Node\n\n\t// outputs\n\tscores  *G.Node // argmax this to get the greedy decoded transition\n\tlogProb *G.Node\n\tcost    *G.Node\n\tcostVal G.Value\n\n\tvm     G.VM\n\tmodel  G.Nodes\n\tsolver G.Solver\n\n\tdict        *corpus.Corpus\n\ttransitions []transition\n\n\tcostChan chan G.Value\n\n\t// wordfeats *G.Node\n\t// tagfeats  *G.Node\n\t// depfeats  *G.Node\n\t// sumfeats  *G.Node\n\t// act       
*G.Node\n}\n\nfunc (nn *neuralnetwork2) initialized() bool {\n\treturn nn.g != nil && nn.sub != nil &&\n\t\tnn.e_w != nil && nn.e_t != nil && nn.e_l != nil &&\n\t\tnn.w1_w != nil && nn.w1_t != nil && nn.w1_l != nil && nn.b != nil &&\n\t\tnn.w2 != nil && len(nn.x_wSelW) > 0 && len(nn.x_tSelT) > 0 && len(nn.x_lSelL) > 0 &&\n\t\tnn.x_w != nil && nn.x_t != nil && nn.x_l != nil &&\n\t\tnn.scores != nil &&\n\t\tnn.dict != nil && nn.vm != nil && nn.solver != nil\n}\n\nfunc (nn *neuralnetwork2) init() error {\n\tif nn.dict == nil {\n\t\treturn errors.Errorf(\"No Corpus Provided to the Neural Network. Will be unable to decode\")\n\t}\n\n\tg := G.NewGraph()\n\tnn.g = g\n\n\tword := nn.dict.Size()\n\ttags := int(lingo.MAXTAG)\n\tdeps := int(lingo.MAXDEPTYPE)\n\t// trns := len(nn.transitions)\n\n\twordFeats := POS_OFFSET - 0\n\ttagFeats := DEP_OFFSET - POS_OFFSET\n\tdepFeats := int(MAXFEATURE) - DEP_OFFSET\n\n\t// In any case a very very very small dict was passed in\n\t// we set the minimum to wordFeatss\n\tif word < wordFeats {\n\t\tword = wordFeats\n\t}\n\n\tlogf(`Word: %d\ntags: %d\ndeps: %d\nwordFeats: %d\ntagFeats: %d\ndepFeats: %d\n`, word, tags, deps, wordFeats, tagFeats, depFeats)\n\n\t// define inputs\n\tnn.x_w = G.NewVector(g, nn.Dtype, G.WithShape(wordFeats*nn.EmbeddingSize), G.WithName(\"word input\"), G.WithInit(G.Zeroes()))\n\tnn.x_t = G.NewVector(g, nn.Dtype, G.WithShape(tagFeats*nn.EmbeddingSize), G.WithName(\"POSTag input\"), G.WithInit(G.Zeroes()))\n\tnn.x_l = G.NewVector(g, nn.Dtype, G.WithShape(depFeats*nn.EmbeddingSize), G.WithName(\"word input\"), G.WithInit(G.Zeroes()))\n\n\tnn.x_wSelW = make(G.Nodes, wordFeats)\n\tnn.x_tSelT = make(G.Nodes, tagFeats)\n\tnn.x_lSelL = make(G.Nodes, depFeats)\n\n\t// define models\n\tnn.e_w = G.NewMatrix(g, nn.Dtype, G.WithShape(word, nn.EmbeddingSize), G.WithName(\"e_w\"), G.WithInit(G.GlorotU(1)))\n\tnn.e_t = G.NewMatrix(g, nn.Dtype, G.WithShape(tags, nn.EmbeddingSize), G.WithName(\"e_t\"), 
G.WithInit(G.GlorotU(1)))\n\tnn.e_l = G.NewMatrix(g, nn.Dtype, G.WithShape(deps, nn.EmbeddingSize), G.WithName(\"e_l\"), G.WithInit(G.GlorotU(1)))\n\n\tnn.w1_w = G.NewMatrix(g, nn.Dtype, G.WithShape(nn.HiddenSize, nn.EmbeddingSize*wordFeats), G.WithName(\"w1_w\"), G.WithInit(G.GlorotU(1)))\n\tnn.w1_t = G.NewMatrix(g, nn.Dtype, G.WithShape(nn.HiddenSize, nn.EmbeddingSize*tagFeats), G.WithName(\"w1_t\"), G.WithInit(G.GlorotU(1)))\n\tnn.w1_l = G.NewMatrix(g, nn.Dtype, G.WithShape(nn.HiddenSize, nn.EmbeddingSize*depFeats), G.WithName(\"w1_l\"), G.WithInit(G.GlorotU(1)))\n\tnn.b = G.NewVector(g, nn.Dtype, G.WithShape(nn.HiddenSize), G.WithName(\"b\"), G.WithInit(G.Zeroes()))\n\n\tnn.w2 = G.NewMatrix(g, nn.Dtype, G.WithShape(MAXTRANSITION, nn.HiddenSize), G.WithName(\"w2\"), G.WithInit(G.GlorotU(1)))\n\n\tnn.model = G.Nodes{nn.e_w, nn.e_t, nn.e_l, nn.w1_w, nn.w1_t, nn.w1_l, nn.b, nn.w2}\n\n\t// define selects\n\t// words first\n\tlogf(\"nn.e_w: %+1.1s\", nn.e_w.Value())\n\tvar err error\n\tfor i := 0; i < wordFeats; i++ {\n\t\tif nn.x_wSelW[i], err = G.Slice(nn.e_w, G.S(i)); err != nil { // dummy slices... they'll be replaced at runtime\n\t\t\treturn err\n\t\t}\n\n\t}\n\n\t// tag features\n\tfor i := 0; i < tagFeats; i++ {\n\t\tif nn.x_tSelT[i], err = G.Slice(nn.e_t, G.S(i)); err != nil { // dummy slices... 
they'll be replaced at runtime\n\t\t\treturn err\n\t\t}\n\t}\n\n\t// dependency features\n\tfor i := 0; i < depFeats; i++ {\n\t\tif nn.x_lSelL[i], err = G.Slice(nn.e_l, G.S(i)); err != nil {\n\t\t\treturn err\n\t\t}\n\t}\n\n\t// forwards\n\tif err = nn.fwd(); err != nil {\n\t\treturn err\n\t}\n\n\t// backprop\n\tif _, err = G.Grad(nn.cost, nn.model...); err != nil {\n\t\treturn err\n\t}\n\n\tnn.sub = g.SubgraphRoots(nn.scores)\n\n\t// prog, locmap, err := G.Compile(nn.g)\n\t// if err != nil {\n\t// \treturn err\n\t// }\n\t// log.Printf(\"Prog: %v\", prog)\n\n\t// ioutil.WriteFile(\"graph.dot\", []byte(g.ToDot()), 0644)\n\n\t// logger := log.New(os.Stderr, \"\", 0)\n\t// nn.vm = G.NewTapeMachine(prog, locmap, G.BindDualValues(nn.model...), G.UseCudaFor(), G.WithLogger(logger), G.WithWatchlist())\n\t// nn.vm = G.NewTapeMachine(prog, locmap, G.BindDualValues(nn.model...), G.UseCudaFor())\n\tnn.vm = G.NewTapeMachine(nn.g, G.BindDualValues(nn.model...), G.UseCudaFor())\n\tG.BindDualValues(nn.scores)(nn.vm) // makes sure that scores is a *dualValue\n\tnn.solver = G.NewAdaGradSolver(G.WithLearnRate(nn.AdaAlpha), G.WithEps(nn.AdaEps), G.WithL2Reg(nn.Reg), G.WithBatchSize(float64(nn.BatchSize)))\n\t// nn.solver = G.NewVanillaSolver(G.WithLearnRate(nn.AdaAlpha), G.WithL2Reg(nn.Reg))\n\treturn nil\n}\n\nfunc (nn *neuralnetwork2) fwd() error {\n\tvar err error\n\n\t// build up x vectors\n\tif nn.x_w, err = G.Concat(0, nn.x_wSelW...); err != nil {\n\t\treturn err\n\t}\n\n\tif nn.x_t, err = G.Concat(0, nn.x_tSelT...); err != nil {\n\t\treturn err\n\t}\n\n\tif nn.x_l, err = G.Concat(0, nn.x_lSelL...); err != nil {\n\t\treturn err\n\t}\n\n\tlogf(\"w1_w %v, x_w %v\", nn.w1_w.Shape(), nn.x_w.Shape())\n\tm_w := &may{nil, nn.w1_w}\n\tm_w.doBinary(G.Mul, nn.x_w)\n\tif m_w.error != nil {\n\t\treturn m_w.error\n\t}\n\n\tlogf(\"w1_t %v, x_t %v\", nn.w1_t.Shape(), nn.x_t.Shape())\n\tm_t := &may{nil, nn.w1_t}\n\tm_t.doBinary(G.Mul, nn.x_t)\n\tif m_t.error != nil {\n\t\treturn 
m_t.error\n\t}\n\n\tlogf(\"w1_l %v, x_l %v\", nn.w1_l.Shape(), nn.x_l.Shape())\n\tm_l := &may{nil, nn.w1_l}\n\tm_l.doBinary(G.Mul, nn.x_l)\n\tif m_l.error != nil {\n\t\treturn m_l.error\n\t}\n\n\t// add and activate layer 1\n\tlogf(\"w : %v\", m_w.n.Shape())\n\tm_w1 := &may{nil, m_w.n}\n\tm_w1.doBinary(G.Add, m_t.n)\n\tm_w1.doBinary(G.Add, m_l.n)\n\tm_w1.doBinary(G.Add, nn.b)\n\tm_w1.doUnary(G.Cube)\n\tif m_w1.error != nil {\n\t\treturn m_w1.error\n\t}\n\n\tif nn.Dropout > 0 {\n\t\tlogf(\"Doing dropout\")\n\t\tm_w1.n, m_w1.error = G.Dropout(m_w1.n, nn.Dropout)\n\t\tif m_w1.error != nil {\n\t\t\treturn m_w1.error\n\t\t}\n\t}\n\n\t// go to softmax layer\n\tlogf(\"w2: %v, w1act: %v\", nn.w2.Shape(), m_w1.n.Shape())\n\tm_sm := &may{nil, nn.w2}\n\tm_sm.doBinary(G.Mul, m_w1.n)\n\tnn.scores = m_sm.n\n\tm_sm.doUnary(G.SoftMax)\n\tif m_sm.error != nil {\n\t\treturn m_sm\n\t}\n\n\tnn.logProb = m_sm.n\n\t// G.WithName(\"Logprob\")(nn.logProb)\n\t// log.Printf(\"LOGPROB %v %p %v\", nn.logProb, nn.logProb, nn.logProb)\n\tif nn.cost, err = G.Slice(nn.logProb, G.S(0)); err != nil { // slice is a dummy tensor.Slice. It'll be replaced at runtime\n\t\treturn err\n\t}\n\n\tG.Read(nn.cost, &nn.costVal)\n\treturn nil\n}\n\nfunc (nn *neuralnetwork2) costProgress() <-chan G.Value {\n\tif nn.costChan == nil {\n\t\tnn.costChan = make(chan G.Value)\n\t}\n\treturn nn.costChan\n}\n\n// train does one epoch of training. 
The examples are batched.\nfunc (nn *neuralnetwork2) train(examples []example) error {\n\tsize := len(examples)\n\tbatches := size / nn.BatchSize\n\n\tvar start, end int\n\tif nn.BatchSize > size {\n\t\tbatches = 1\n\t\tend = size\n\t\tG.WithBatchSize(float64(size))(nn.solver) // set it such that the solver doesn't get confused\n\t} else {\n\t\tend = nn.BatchSize\n\t}\n\n\tfor batch := 0; batch < batches; batch++ {\n\t\tfor _, ex := range examples[start:end] {\n\t\t\tnn.feats2vec(ex.features)\n\t\t\ttid := lookupTransition(ex.transition, nn.transitions)\n\n\t\t\tif err := G.UnsafeLet(nn.cost, G.S(tid)); err != nil {\n\t\t\t\treturn err\n\t\t\t}\n\n\t\t\tif err := nn.vm.RunAll(); err != nil {\n\t\t\t\treturn err\n\t\t\t}\n\n\t\t\tnn.vm.Reset()\n\t\t}\n\t\tif err := nn.solver.Step(G.NodesToValueGrads(nn.model)); err != nil {\n\t\t\terr = errors.Wrapf(err, \"Stepping on the model failed %v\", batch)\n\t\t\treturn err\n\t\t}\n\n\t\tif nn.costChan != nil {\n\t\t\tnn.costChan <- nn.costVal\n\t\t}\n\n\t\tstart = end\n\t\tif start >= size {\n\t\t\tbreak\n\t\t}\n\t\tend += nn.BatchSize\n\t\tif end >= size {\n\t\t\tend = size\n\t\t}\n\t}\n\n\treturn nil\n}\n\n// pred predicts the index of the transitions\nfunc (nn *neuralnetwork2) pred(ind []int) (int, error) {\n\tnn.feats2vec(ind)\n\n\t// f, _ := os.OpenFile(\"LOOOOOG\", os.O_APPEND|os.O_CREATE|os.O_RDWR, 0644)\n\t// logger := log.New(f, \"\", 0)\n\t// logger := log.New(os.Stderr, \"\", 0)\n\n\t// m := G.NewLispMachine(nn.sub, G.ExecuteFwdOnly(), G.WithLogger(logger), G.WithWatchlist(), G.LogBothDir(), G.WithValueFmt(\"%+3.3v\"))\n\tm := G.NewLispMachine(nn.sub, G.ExecuteFwdOnly())\n\tif err := m.RunAll(); err != nil {\n\t\treturn 0, err\n\t}\n\t// logger.Println(\"========================\\n\")\n\n\tval := nn.scores.Value().(tensor.Tensor)\n\tt, err := tensor.Argmax(val, tensor.AllAxes)\n\tif err != nil {\n\t\treturn 0, err\n\t}\n\n\treturn t.ScalarValue().(int), nil\n}\n\n// utility function\n\nfunc (nn *neuralnetwork2) 
feats2vec(indicators []int) error {\n\t// fix word features\n\tfor i, ind := range indicators[:POS_OFFSET] {\n\t\tif err := G.UnsafeLet(nn.x_wSelW[i], G.S(ind-wordFeatsStartAt)); err != nil {\n\t\t\treturn err\n\t\t}\n\t}\n\n\t// fix tag features\n\tfor i, ind := range indicators[POS_OFFSET:DEP_OFFSET] {\n\t\tif err := G.UnsafeLet(nn.x_tSelT[i], G.S(ind)); err != nil {\n\t\t\treturn err\n\t\t}\n\t}\n\n\tfor i, ind := range indicators[DEP_OFFSET:] {\n\t\tif err := G.UnsafeLet(nn.x_lSelL[i], G.S(ind-labelFeatsStartAt)); err != nil {\n\t\t\treturn err\n\t\t}\n\t}\n\n\treturn nil\n}\n"
  },
  {
    "path": "dep/nn2_io.go",
    "content": "package dep\n\nimport (\n\t\"bytes\"\n\t\"encoding/gob\"\n\t\"fmt\"\n\n\t\"github.com/pkg/errors\"\n\tG \"gorgonia.org/gorgonia\"\n\tT \"gorgonia.org/tensor\"\n)\n\nvar empty struct{}\n\nfunc (nn *neuralnetwork2) String() string {\n\ts := `Config\n------\n%v\nInfo\n------\nEmbeddings_Word       : %v\nEmbeddings_POStag     : %v\nEmbeddings_Dependency : %v\nSelects_Words         : %d\nSelects_POSTag        : %d\nSelects_Dependency    : %d\nWeights1_Word         : %v\nWeights1_POSTag       : %v\nWeights1_Dependency   : %v\nBiases                : %v\nWeights2              : %v\n`\n\n\treturn fmt.Sprintf(s, nn.NNConfig,\n\t\tnn.e_w.Shape(), nn.e_t.Shape(), nn.e_l.Shape(),\n\t\tlen(nn.x_wSelW), len(nn.x_tSelT), len(nn.x_lSelL),\n\t\tnn.w1_w.Shape(), nn.w1_t.Shape(), nn.w1_l.Shape(),\n\t\tnn.b.Shape(), nn.w2.Shape())\n}\n\nfunc (nn *neuralnetwork2) GobEncode() ([]byte, error) {\n\tif !nn.initialized() {\n\t\treturn nil, errors.Errorf(\"Neural network not initialized. Cannot gob\")\n\t}\n\n\tvar buf bytes.Buffer\n\tencoder := gob.NewEncoder(&buf)\n\n\tif err := encoder.Encode(nn.NNConfig); err != nil {\n\t\treturn nil, err\n\t}\n\n\tif err := encoder.Encode(nn.e_w.Value()); err != nil {\n\t\treturn nil, err\n\t}\n\n\tif err := encoder.Encode(nn.e_t.Value()); err != nil {\n\t\treturn nil, err\n\t}\n\n\tif err := encoder.Encode(nn.e_l.Value()); err != nil {\n\t\treturn nil, err\n\t}\n\n\tif err := encoder.Encode(nn.w1_w.Value()); err != nil {\n\t\treturn nil, err\n\t}\n\n\tif err := encoder.Encode(nn.w1_t.Value()); err != nil {\n\t\treturn nil, err\n\t}\n\n\tif err := encoder.Encode(nn.w1_l.Value()); err != nil {\n\t\treturn nil, err\n\t}\n\n\tif err := encoder.Encode(nn.b.Value()); err != nil {\n\t\treturn nil, err\n\t}\n\n\tif err := encoder.Encode(nn.w2.Value()); err != nil {\n\t\treturn nil, err\n\t}\n\treturn buf.Bytes(), nil\n}\n\nfunc (nn *neuralnetwork2) GobDecode(buf []byte) error {\n\t// prechecks\n\tif nn.dict == nil {\n\t\treturn 
errors.Errorf(\"Neural Network has no corpus attached to it (Corpuses are serialized separately).\")\n\t}\n\n\tb := bytes.NewBuffer(buf)\n\tdecoder := gob.NewDecoder(b)\n\n\tif err := decoder.Decode(&nn.NNConfig); err != nil {\n\t\treturn err\n\t}\n\n\tif err := nn.init(); err != nil {\n\t\treturn err\n\t}\n\n\te_w := T.New(T.Of(nn.Dtype), T.WithShape(nn.e_w.Shape()...))\n\tif err := decoder.Decode(e_w); err != nil {\n\t\treturn err\n\t}\n\tG.Let(nn.e_w, e_w)\n\n\te_t := T.New(T.Of(nn.Dtype), T.WithShape(nn.e_t.Shape()...))\n\tif err := decoder.Decode(e_t); err != nil {\n\t\treturn err\n\t}\n\tG.Let(nn.e_t, e_t)\n\n\te_l := T.New(T.Of(nn.Dtype), T.WithShape(nn.e_l.Shape()...))\n\tif err := decoder.Decode(e_l); err != nil {\n\t\treturn err\n\t}\n\tG.Let(nn.e_l, e_l)\n\n\tw1_w := T.New(T.Of(nn.Dtype), T.WithShape(nn.w1_w.Shape()...))\n\tif err := decoder.Decode(w1_w); err != nil {\n\t\treturn err\n\t}\n\tG.Let(nn.w1_w, w1_w)\n\n\tw1_t := T.New(T.Of(nn.Dtype), T.WithShape(nn.w1_t.Shape()...))\n\tif err := decoder.Decode(w1_t); err != nil {\n\t\treturn err\n\t}\n\tG.Let(nn.w1_t, w1_t)\n\n\tw1_l := T.New(T.Of(nn.Dtype), T.WithShape(nn.w1_l.Shape()...))\n\tif err := decoder.Decode(w1_l); err != nil {\n\t\treturn err\n\t}\n\tG.Let(nn.w1_l, w1_l)\n\n\tbias := T.New(T.Of(nn.Dtype), T.WithShape(nn.b.Shape()...))\n\tif err := decoder.Decode(bias); err != nil {\n\t\treturn err\n\t}\n\tG.Let(nn.b, bias)\n\n\tw2 := T.New(T.Of(nn.Dtype), T.WithShape(nn.w2.Shape()...))\n\tif err := decoder.Decode(w2); err != nil {\n\t\treturn err\n\t}\n\tG.Let(nn.w2, w2)\n\n\treturn nil\n}\n"
  },
  {
    "path": "dep/nn2_io_test.go",
    "content": "package dep\n\nimport (\n\t\"bytes\"\n\t\"encoding/gob\"\n\t\"fmt\"\n\t\"testing\"\n\n\t\"github.com/chewxy/lingo\"\n\t\"github.com/chewxy/lingo/corpus\"\n\tG \"gorgonia.org/gorgonia\"\n)\n\nfunc TestNNIO(t *testing.T) {\n\tsts := allSentences()\n\tnn := new(neuralnetwork2)\n\tnn.NNConfig = DefaultNNConfig\n\tnn.dict = corpus.GenerateCorpus(sts)\n\tnn.transitions = transitions\n\n\tif err := nn.init(); err != nil {\n\t\tt.Fatalf(\"%+v\", err)\n\t}\n\n\ts := `Config\n------\nBatch Size               : 10000\nDropout Rate             : 0.500000\nAdaGrad Eps (ε)          : 0.000001\nAdaGrad Learn Rate (η)   : 0.010000\nRegularization Parameter : 0.000002\nHidden Layer Size        : 200\nEmbedding Size           : 50\nNumber Precomputed       : 30000\n\nEvaluate Per 100 Iterations\nClear Gradients Per 0 Iterations\nDtype: float64\n\nInfo\n------\nEmbeddings_Word       : (74, 50)\nEmbeddings_POStag     : (%d, 50)\nEmbeddings_Dependency : (%d, 50)\nSelects_Words         : 18\nSelects_POSTag        : 18\nSelects_Dependency    : 12\nWeights1_Word         : (200, 900)\nWeights1_POSTag       : (200, 900)\nWeights1_Dependency   : (200, 600)\nBiases                : (200)\nWeights2              : (%d, 200)\n`\n\n\tcorrectDesc := fmt.Sprintf(s, lingo.MAXTAG, lingo.MAXDEPTYPE, MAXTRANSITION)\n\tif nn.String() != correctDesc {\n\t\tt.Errorf(\"Oops. Got %q. Want %q\", nn.String(), correctDesc)\n\t}\n\t// nn.Dtype = tensor.Float32\n\n\tvar buf bytes.Buffer\n\tencoder := gob.NewEncoder(&buf)\n\tif err := encoder.Encode(nn); err != nil {\n\t\tt.Fatalf(\"%+v\", err)\n\t}\n\n\tdecoder := gob.NewDecoder(&buf)\n\tnn2 := new(neuralnetwork2)\n\tnn2.dict = corpus.GenerateCorpus(sts)\n\tnn2.transitions = transitions\n\tif err := decoder.Decode(nn2); err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tif nn.String() != correctDesc {\n\t\tt.Fatalf(\"Oops. Got %q. 
Want %q\", nn.String(), correctDesc)\n\t}\n\n\tif !G.ValueEq(nn.e_w.Value(), nn2.e_w.Value()) {\n\t\tt.Errorf(\"Expected e_w to be the same. Expected %1.1s. Got %1.1s\", nn.e_w.Value(), nn2.e_w.Value())\n\t}\n\n\tif !G.ValueEq(nn.e_t.Value(), nn2.e_t.Value()) {\n\t\tt.Errorf(\"Expected e_t to be the same. Expected %1.1s. Got %1.1s\", nn.e_t.Value(), nn2.e_t.Value())\n\t}\n\n\tif !G.ValueEq(nn.e_l.Value(), nn2.e_l.Value()) {\n\t\tt.Errorf(\"Expected e_l to be the same. Expected %1.1s. Got %1.1s\", nn.e_l.Value(), nn2.e_l.Value())\n\t}\n\n\tif !G.ValueEq(nn.w1_w.Value(), nn2.w1_w.Value()) {\n\t\tt.Errorf(\"Expected w1_w to be the same. Expected %1.1s. Got %1.1s\", nn.w1_w.Value(), nn2.w1_w.Value())\n\t}\n\n\tif !G.ValueEq(nn.w1_t.Value(), nn2.w1_t.Value()) {\n\t\tt.Errorf(\"Expected w1_t to be the same. Expected %1.1s. Got %1.1s\", nn.w1_t.Value(), nn2.w1_t.Value())\n\t}\n\n\tif !G.ValueEq(nn.w1_l.Value(), nn2.w1_l.Value()) {\n\t\tt.Errorf(\"Expected w1_l to be the same. Expected %1.1s. Got %1.1s\", nn.w1_l.Value(), nn2.w1_l.Value())\n\t}\n\n\tif !G.ValueEq(nn.b.Value(), nn2.b.Value()) {\n\t\tt.Errorf(\"Expected b to be the same. Expected %1.1s. Got %1.1s\", nn.b.Value(), nn2.b.Value())\n\t}\n\n\tif !G.ValueEq(nn.w2.Value(), nn2.w2.Value()) {\n\t\tt.Errorf(\"Expected w2 to be the same. Expected %1.1s. Got %1.1s\", nn.w2.Value(), nn2.w2.Value())\n\t}\n\n\tt.Logf(\"Visual Inspection: \\n%+1.8s\\n%+1.8s\", nn.e_w.Value(), nn2.e_w.Value())\n\n\t// special case\n\tbuf.Reset()\n\tencoder = gob.NewEncoder(&buf)\n\tif err := encoder.Encode(nn); err != nil {\n\t\tt.Fatalf(\"%+v\", err)\n\t}\n\tdecoder = gob.NewDecoder(&buf)\n\tnn3 := new(neuralnetwork2)\n\tif err := decoder.Decode(nn3); err == nil {\n\t\tt.Error(\"Expected a nocorpus error\")\n\t}\n}\n"
  },
  {
    "path": "dep/nn2_test.go",
    "content": "package dep\n\nimport (\n\t\"math/rand\"\n\t\"testing\"\n\t\"time\"\n\n\t\"github.com/chewxy/lingo/corpus\"\n\t\"gorgonia.org/gorgonia\"\n)\n\nfunc TestNN2(t *testing.T) {\n\trand.Seed(1337)\n\n\t// we test 50 iterations unless the short flag is passed in\n\tepochs := 50\n\tif testing.Short() {\n\t\tepochs = 10\n\t}\n\n\tsts := allSentences()\n\tnn := new(neuralnetwork2)\n\tnn.NNConfig = DefaultNNConfig\n\tnn.Dtype = gorgonia.Float32\n\tnn.dict = corpus.GenerateCorpus(sts)\n\tnn.transitions = transitions\n\n\tif err := nn.init(); err != nil {\n\t\tt.Fatalf(\"%+v\", err)\n\t}\n\n\tvar costs []float64\n\tch := nn.costProgress()\n\tsigChan := make(chan struct{})\n\n\tgo func(ch <-chan gorgonia.Value, sig chan struct{}) {\n\t\tfor cost := range ch {\n\t\t\tswitch c := cost.Data().(type) {\n\t\t\tcase float32:\n\t\t\t\tcosts = append(costs, float64(c))\n\t\t\tcase float64:\n\t\t\t\tcosts = append(costs, c)\n\t\t\t}\n\n\t\t\tt.Logf(\"Cost %v\", cost)\n\t\t}\n\t\tsig <- struct{}{}\n\t}(ch, sigChan)\n\n\texs := makeExamples(sts, nn.NNConfig, nn.dict, transitions, dummyFix{})\n\n\tstart := time.Now()\n\tfor i := 0; i < epochs; i++ {\n\t\tif err := nn.train(exs); err != nil {\n\t\t\tt.Errorf(\"%+v\", err)\n\t\t}\n\t\tshuffleExamples(exs)\n\t}\n\t// simulate what *DependencyParser would do\n\tclose(nn.costChan)\n\tnn.costChan = nil\n\n\tt.Logf(\"Training %d iterations took Taken: %v\", epochs, time.Since(start))\n\n\t<-sigChan\n\tif len(costs) == 0 {\n\t\tt.Error(\"Expected some costs\")\n\t}\n\tif costs[0] <= costs[len(costs)-1] {\n\t\tt.Error(\"Expected costs to have reduced during training\")\n\t}\n\n\t// PREDICTION TIME!\n\n\tss2 := simpleSentence()\n\texs = makeExamples(ss2, nn.NNConfig, nn.dict, transitions, dummyFix{})\n\tstart = time.Now()\n\tfor i, ex := range exs {\n\t\tind, err := nn.pred(ex.features)\n\t\tif err != nil {\n\t\t\tt.Errorf(\"Example %d failed: %v\", i, err)\n\t\t\tcontinue\n\t\t}\n\n\t\tt.Logf(\"Example %d. Want: %v. Got %v. 
Same: %t\", i, ex.transition, transitions[ind], ex.transition == transitions[ind])\n\t}\n\tt.Logf(\"Pred Time Taken: %v\", time.Since(start))\n}\n"
  },
  {
    "path": "dep/nnconfig.go",
    "content": "package dep\n\nimport (\n\t\"bytes\"\n\t\"encoding/gob\"\n\t\"fmt\"\n\n\t\"github.com/pkg/errors\"\n\t\"gorgonia.org/tensor\"\n)\n\n// NNConfig configures the neural network\ntype NNConfig struct {\n\tBatchSize                  int     // 10000\n\tDropout                    float64 // 0.5\n\tAdaEps                     float64 // 1e-6\n\tAdaAlpha                   float64 //0.02\n\tReg                        float64 // 1e-8\n\tHiddenSize                 int     // 200\n\tEmbeddingSize              int     // 50\n\tNumPrecomputed             int     //100000\n\tEvalPerIteration           int     // 100\n\tClearGradientsPerIteration int     // 0\n\n\tDtype tensor.Dtype\n}\n\nfunc (c NNConfig) String() string {\n\ts := `Batch Size               : %d\nDropout Rate             : %f\nAdaGrad Eps (ε)          : %f\nAdaGrad Learn Rate (η)   : %f\nRegularization Parameter : %f\nHidden Layer Size        : %d\nEmbedding Size           : %d\nNumber Precomputed       : %d\n\nEvaluate Per %d Iterations\nClear Gradients Per %d Iterations\nDtype: %v\n`\n\treturn fmt.Sprintf(s, c.BatchSize, c.Dropout, c.AdaEps, c.AdaAlpha, c.Reg, c.HiddenSize, c.EmbeddingSize, c.NumPrecomputed, c.EvalPerIteration, c.ClearGradientsPerIteration, c.Dtype)\n}\n\n// DefaultNNConfig is the default config that is passed in, for initialization purposses.\nvar DefaultNNConfig NNConfig\n\nfunc (c NNConfig) GobEncode() ([]byte, error) {\n\tvar buf bytes.Buffer\n\tencoder := gob.NewEncoder(&buf)\n\tencoder.Encode(c.BatchSize)\n\tencoder.Encode(c.Dropout)\n\tencoder.Encode(c.AdaEps)\n\tencoder.Encode(c.AdaAlpha)\n\tencoder.Encode(c.Reg)\n\tencoder.Encode(c.HiddenSize)\n\tencoder.Encode(c.EmbeddingSize)\n\tencoder.Encode(c.NumPrecomputed)\n\tencoder.Encode(c.EvalPerIteration)\n\tencoder.Encode(c.ClearGradientsPerIteration)\n\n\tswitch c.Dtype {\n\tcase tensor.Float64:\n\t\tencoder.Encode(byte(0))\n\tcase tensor.Float32:\n\t\tencoder.Encode(byte(1))\n\tdefault:\n\t\treturn nil, 
errors.Errorf(\"Unsupported Dtype to be GobEncoded\")\n\t}\n\treturn buf.Bytes(), nil\n}\n\nfunc (c *NNConfig) GobDecode(p []byte) error {\n\tb := bytes.NewBuffer(p)\n\tdecoder := gob.NewDecoder(b)\n\n\tdecoder.Decode(&c.BatchSize)\n\tdecoder.Decode(&c.Dropout)\n\tdecoder.Decode(&c.AdaEps)\n\tdecoder.Decode(&c.AdaAlpha)\n\tdecoder.Decode(&c.Reg)\n\tdecoder.Decode(&c.HiddenSize)\n\tdecoder.Decode(&c.EmbeddingSize)\n\tdecoder.Decode(&c.NumPrecomputed)\n\tdecoder.Decode(&c.EvalPerIteration)\n\tdecoder.Decode(&c.ClearGradientsPerIteration)\n\n\tvar bite byte\n\tdecoder.Decode(&bite)\n\tswitch bite {\n\tcase 0:\n\t\tc.Dtype = tensor.Float64\n\tcase 1:\n\t\tc.Dtype = tensor.Float32\n\tdefault:\n\t\treturn errors.Errorf(\"Unsupported Dtype to be GobDecoded: %v\", bite)\n\t}\n\treturn nil\n}\n\nfunc init() {\n\tDefaultNNConfig = NNConfig{\n\t\tBatchSize: 10000,\n\t\tDropout:   0.5,\n\n\t\tAdaEps:   1e-6,\n\t\tAdaAlpha: 0.01,\n\n\t\tReg: 1.5e-6,\n\n\t\tHiddenSize:     200,\n\t\tEmbeddingSize:  50,\n\t\tNumPrecomputed: 30000,\n\n\t\tEvalPerIteration:           100,\n\t\tClearGradientsPerIteration: 0,\n\n\t\tDtype: tensor.Float64,\n\t\t// Dtype: gorgonia.Float32,\n\t}\n}\n"
  },
  {
    "path": "dep/release.go",
    "content": "// +build !debug\n\npackage dep\n\nconst BUILD_DEBUG = \"PARSER: RELEASE BUILD\"\nconst BUILD_DIAG = \"Non-Diagnostic Build\"\n\nconst DEBUG = false\n\nvar READMEMSTATS = false\n\nvar TABCOUNT uint32 = 0\n\nfunc enterLoggingContext() {}\n\nfunc leaveLoggingContext() {}\n\nfunc logTrainingProgress(iteration, correct, total, length, possibles int) {}\n\nfunc logMemStats() {}\n\nfunc logf(format string, others ...interface{}) {}\n\nfunc recoverFrom(format string, attrs ...interface{}) {}\n\nfunc (d *Parser) SprintFeatures(feature []int) string { return \"\" }\n\nfunc SprintScores(scores []float64, ts []transition) string { return \"\" }\n"
  },
  {
    "path": "dep/span.go",
    "content": "package dep\n\ntype span struct {\n\tstart, end int\n}\n\nfunc makeSpan(start, end int) span {\n\tif end <= start {\n\t\tpanic(\"Impossible span created\")\n\t}\n\treturn span{start, end}\n}\n\nfunc (s span) combine(other span) span {\n\tstart := minInt(s.start, other.start)\n\tend := maxInt(s.end, other.end)\n\treturn span{start, end}\n}\n"
  },
  {
    "path": "dep/test_test.go",
    "content": "package dep\n\nimport (\n\t\"bufio\"\n\t\"crypto/md5\"\n\t\"encoding/gob\"\n\t\"fmt\"\n\t\"io\"\n\t\"log\"\n\t\"os\"\n\t\"strings\"\n\n\t\"github.com/chewxy/lingo\"\n\t\"github.com/chewxy/lingo/treebank\"\n\t\"github.com/kljensen/snowball\"\n)\n\ntype dummyLem struct{}\n\nfunc (dummyLem) Lemmatize(s string, pt lingo.POSTag) ([]string, error) {\n\treturn nil, componentUnavailable(\"lemmatizer\")\n}\n\ntype dummyStemmer struct{}\n\nfunc (dummyStemmer) Stem(s string) (string, error) {\n\treturn snowball.Stem(s, \"english\", true)\n}\n\ntype dummyFix struct {\n\tdummyStemmer\n\tdummyLem\n}\n\nfunc (dummyFix) Clusters() (map[string]lingo.Cluster, error) {\n\treturn nil, componentUnavailable(\"clusters\")\n}\n\nconst nnps = `1\tGuerrillas\tguerrilla\tNOUN\tNNS\tNumber=Plur\t2\tnsubj\t_\t_\n2\tthreatened\tthreaten\tVERB\tVBD\tMood=Ind|Tense=Past|VerbForm=Fin\t0\troot\t_\t_\n3\tto\tto\tPART\tTO\t_\t4\tmark\t_\t_\n4\tassassinate\tassassinate\tVERB\tVB\tVerbForm=Inf\t2\txcomp\t_\t_\n5\tPrime\tPrime\tPROPN\tNNP\tNumber=Sing\t6\tcompound\t_\t_\n6\tMinister\tMinister\tPROPN\tNNP\tNumber=Sing\t8\tcompound\t_\t_\n7\tIyad\tIyad\tPROPN\tNNP\tNumber=Sing\t8\tcompound\t_\t_\n8\tAllawi\tAllawi\tPROPN\tNNP\tNumber=Sing\t4\tdobj\t_\t_\n9\tand\tand\tCONJ\tCC\t_\t8\tcc\t_\t_\n10\tMinister\tMinister\tPROPN\tNNP\tNumber=Sing\t14\tcompound\t_\t_\n11\tof\tof\tADP\tIN\t_\t12\tcase\t_\t_\n12\tDefense\tDefense\tPROPN\tNNP\tNumber=Sing\t10\tnmod\t_\t_\n13\tHazem\tHazem\tPROPN\tNNP\tNumber=Sing\t14\tcompound\t_\t_\n14\tShaalan\tShaalan\tPROPN\tNNP\tNumber=Sing\t8\tconj\t_\t_\n15\tin\tin\tADP\tIN\t_\t16\tcase\t_\t_\n16\tretaliation\tretaliation\tNOUN\tNN\tNumber=Sing\t4\tnmod\t_\t_\n17\tfor\tfor\tADP\tIN\t_\t19\tcase\t_\t_\n18\tthe\tthe\tDET\tDT\tDefinite=Def|PronType=Art\t19\tdet\t_\t_\n19\tattack\tattack\tNOUN\tNN\tNumber=Sing\t16\tnmod\t_\t_\n20\t.\t.\tPUNCT\t.\t_\t2\tpunct\t_\t_\n\n`\nconst simple = 
`1\tYet\tyet\tCONJ\tCC\t_\t5\tcc\t_\t_\n2\twe\twe\tPRON\tPRP\tCase=Nom|Number=Plur|Person=1|PronType=Prs\t5\tnsubj\t_\t_\n3\tdid\tdo\tAUX\tVBD\tMood=Ind|Tense=Past|VerbForm=Fin\t5\taux\t_\t_\n4\tn't\tnot\tPART\tRB\t_\t5\tneg\t_\t_\n5\tcharge\tcharge\tVERB\tVB\tVerbForm=Inf\t0\troot\t_\t_\n6\tthem\tthey\tPRON\tPRP\tCase=Acc|Number=Plur|Person=3|PronType=Prs\t5\tdobj\t_\t_\n7\tfor\tfor\tADP\tIN\t_\t9\tcase\t_\t_\n8\tthe\tthe\tDET\tDT\tDefinite=Def|PronType=Art\t9\tdet\t_\t_\n9\tevacuation\tevacuation\tNOUN\tNN\tNumber=Sing\t5\tnmod\t_\t_\n10\t.\t.\tPUNCT\t.\t_\t5\tpunct\t_\t_\n\n`\n\nconst med = `1\tPresident\tPresident\tPROPN\tNNP\tNumber=Sing\t2\tcompound\t_\t_\n2\tBush\tBush\tPROPN\tNNP\tNumber=Sing\t5\tnsubj\t_\t_\n3\ton\ton\tADP\tIN\t_\t4\tcase\t_\t_\n4\tTuesday\tTuesday\tPROPN\tNNP\tNumber=Sing\t5\tnmod\t_\t_\n5\tnominated\tnominate\tVERB\tVBD\tMood=Ind|Tense=Past|VerbForm=Fin\t0\troot\t_\t_\n6\ttwo\ttwo\tNUM\tCD\tNumType=Card\t7\tnummod\t_\t_\n7\tindividuals\tindividual\tNOUN\tNNS\tNumber=Plur\t5\tdobj\t_\t_\n8\tto\tto\tPART\tTO\t_\t9\tmark\t_\t_\n9\treplace\treplace\tVERB\tVB\tVerbForm=Inf\t5\tadvcl\t_\t_\n10\tretiring\tretire\tVERB\tVBG\tVerbForm=Ger\t11\tamod\t_\t_\n11\tjurists\tjurist\tNOUN\tNNS\tNumber=Plur\t9\tdobj\t_\t_\n12\ton\ton\tADP\tIN\t_\t14\tcase\t_\t_\n13\tfederal\tfederal\tADJ\tJJ\tDegree=Pos\t14\tamod\t_\t_\n14\tcourts\tcourt\tNOUN\tNNS\tNumber=Plur\t11\tnmod\t_\t_\n15\tin\tin\tADP\tIN\t_\t18\tcase\t_\t_\n16\tthe\tthe\tDET\tDT\tDefinite=Def|PronType=Art\t18\tdet\t_\t_\n17\tWashington\tWashington\tPROPN\tNNP\tNumber=Sing\t18\tcompound\t_\t_\n18\tarea\tarea\tNOUN\tNN\tNumber=Sing\t14\tnmod\t_\t_\n19\t.\t.\tPUNCT\t.\t_\t5\tpunct\t_\t_\n\n`\n\nconst long = 
`1\tNow\tnow\tADV\tRB\t_\t5\tadvmod\t_\t_\n2\t,\t,\tPUNCT\t,\t_\t5\tpunct\t_\t_\n3\tI\tI\tPRON\tPRP\tCase=Nom|Number=Sing|Person=1|PronType=Prs\t5\tnsubj\t_\t_\n4\twould\twould\tAUX\tMD\tVerbForm=Fin\t5\taux\t_\t_\n5\targue\targue\tVERB\tVB\tVerbForm=Inf\t0\troot\t_\t_\n6\tthat\tthat\tSCONJ\tIN\t_\t11\tmark\t_\t_\n7\tone\tone\tPRON\tPRP\t_\t11\tnsubj\t_\t_\n8\tcould\tcould\tAUX\tMD\tVerbForm=Fin\t11\taux\t_\t_\n9\thave\thave\tAUX\tVB\tVerbForm=Inf\t11\taux\t_\t_\n10\treasonably\treasonably\tADV\tRB\t_\t11\tadvmod\t_\t_\n11\tpredicted\tpredict\tVERB\tVBN\tTense=Past|VerbForm=Part\t5\tccomp\t_\t_\n12\tthat\tthat\tSCONJ\tIN\t_\t19\tmark\t_\t_\n13\tsome\tsome\tDET\tDT\t_\t14\tdet\t_\t_\n14\tform\tform\tNOUN\tNN\tNumber=Sing\t19\tnsubj\t_\t_\n15\tof\tof\tADP\tIN\t_\t17\tcase\t_\t_\n16\tmilitary\tmilitary\tADJ\tJJ\tDegree=Pos\t17\tamod\t_\t_\n17\tviolence\tviolence\tNOUN\tNN\tNumber=Sing\t14\tnmod\t_\t_\n18\twas\tbe\tVERB\tVBD\tMood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin\t19\tcop\t_\t_\n19\tlikely\tlikely\tADJ\tJJ\tDegree=Pos\t11\tccomp\t_\t_\n20\tto\tto\tPART\tTO\t_\t21\tmark\t_\t_\n21\toccur\toccur\tVERB\tVB\tVerbForm=Inf\t19\txcomp\t_\t_\n22\tin\tin\tADP\tIN\t_\t23\tcase\t_\t_\n23\tLebanon\tLebanon\tPROPN\tNNP\tNumber=Sing\t21\tnmod\t_\t_\n24\t-LRB-\t-lrb-\tPUNCT\t-LRB-\t_\t25\tpunct\t_\t_\n25\tconsidering\tconsider\tVERB\tVBG\tVerbForm=Ger\t19\tadvcl\t_\t_\n26\tthat\tthat\tSCONJ\tIN\t_\t31\tmark\t_\t_\n27\tthe\tthe\tDET\tDT\tDefinite=Def|PronType=Art\t28\tdet\t_\t_\n28\tcountry\tcountry\tNOUN\tNN\tNumber=Sing\t31\tnsubj\t_\t_\n29\thas\thave\tAUX\tVBZ\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin\t31\taux\t_\t_\n30\tbeen\tbe\tAUX\tVBN\tTense=Past|VerbForm=Part\t31\taux\t_\t_\n31\texperiencing\texperience\tVERB\tVBG\tTense=Pres|VerbForm=Part\t25\tccomp\t_\t_\n32\tsome\tsome\tDET\tDT\t_\t33\tdet\t_\t_\n33\tform\tform\tNOUN\tNN\tNumber=Sing\t31\tdobj\t_\t_\n34\tof\tof\tADP\tIN\t_\t35\tcase\t_\t_\n35\tconflict\tconflict\tNOUN\tNN\tNumber=Sing\t33\tnmod\t
_\t_\n36\tfor\tfor\tADP\tIN\t_\t41\tcase\t_\t_\n37\tapproximately\tapproximately\tADV\tRB\t_\t41\tadvmod\t_\t_\n38\tthe\tthe\tDET\tDT\tDefinite=Def|PronType=Art\t41\tdet\t_\t_\n39\tlast\tlast\tADJ\tJJ\tDegree=Pos\t41\tamod\t_\t_\n40\t32\t32\tNUM\tCD\tNumType=Card\t41\tnummod\t_\t_\n41\tyears\tyear\tNOUN\tNNS\tNumber=Plur\t31\tnmod\t_\t_\n42\t-RRB-\t-rrb-\tPUNCT\t-RRB-\t_\t25\tpunct\t_\t_\n43\t.\t.\tPUNCT\t.\t_\t5\tpunct\t_\t_\n\n`\n\nconst cvconllu = `1\tGoogle\tGoogle\tPROPN\tNNP\tNumber=Sing\t6\tnsubj\t_\t_\n2\tis\tbe\tVERB\tVBZ\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin\t6\tcop\t_\t_\n3\ta\ta\tDET\tDT\tDefinite=Ind|PronType=Art\t6\tdet\t_\t_\n4\tnice\tnice\tADJ\tJJ\tDegree=Pos\t6\tamod\t_\t_\n5\tsearch\tsearch\tNOUN\tNN\tNumber=Sing\t6\tcompound\t_\t_\n6\tengine\tengine\tNOUN\tNN\tNumber=Sing\t0\troot\t_\t_\n7\t.\t.\tPUNCT\t.\t_\t6\tpunct\t_\t_\n\n1\tDoes\tdo\tAUX\tVBZ\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin\t3\taux\t_\t_\n2\tanybody\tanybody\tNOUN\tNN\tNumber=Sing\t3\tnsubj\t_\t_\n3\tuse\tuse\tVERB\tVB\tVerbForm=Inf\t0\troot\t_\t_\n4\tit\tit\tPRON\tPRP\tCase=Acc|Gender=Neut|Number=Sing|Person=3|PronType=Prs\t3\tdobj\t_\t_\n5\tfor\tfor\tADP\tIN\t_\t6\tcase\t_\t_\n6\tanything\tanything\tNOUN\tNN\tNumber=Sing\t3\tnmod\t_\t_\n7\telse\telse\tADJ\tJJ\tDegree=Pos\t6\tamod\t_\t_\n8\t?\t?\tPUNCT\t.\t_\t3\tpunct\t_\t_\n\n`\n\nfunc lotsaNNP() *lingo.Dependency {\n\treadr := strings.NewReader(nnps)\n\tsentenceTags := treebank.ReadConllu(readr)\n\n\treturn sentenceTags[0].Dependency(dummyFix{})\n}\n\n// simpleSentence has 10 words\nfunc simpleSentence() []treebank.SentenceTag {\n\treadr := strings.NewReader(simple)\n\treturn treebank.ReadConllu(readr)\n}\n\nfunc mediumSentence() []treebank.SentenceTag {\n\treadr := strings.NewReader(med)\n\treturn treebank.ReadConllu(readr)\n}\n\n// longSentence has 44 words\nfunc longSentence() []treebank.SentenceTag {\n\treadr := strings.NewReader(long)\n\treturn treebank.ReadConllu(readr)\n}\n\nfunc 
allSentences() []treebank.SentenceTag {\n\tsentenceTags := treebank.ReadConllu(strings.NewReader(nnps))\n\tsentenceTags = append(sentenceTags, treebank.ReadConllu(strings.NewReader(simple))...)\n\tsentenceTags = append(sentenceTags, treebank.ReadConllu(strings.NewReader(med))...)\n\tsentenceTags = append(sentenceTags, treebank.ReadConllu(strings.NewReader(long))...)\n\treturn sentenceTags\n}\n\nfunc cvSentences() []treebank.SentenceTag {\n\treturn treebank.ReadConllu(strings.NewReader(cvconllu))\n}\n\nfunc hash(s string) string {\n\th := md5.New()\n\tio.WriteString(h, s)\n\treturn fmt.Sprintf(\"%x\", h.Sum(nil))\n}\n\nfunc cache(input string, s lingo.AnnotatedSentence) {\n\thashfilename := \"cached/\" + hash(input) + \".cached\"\n\tf, err := os.Create(hashfilename)\n\tif err != nil {\n\t\tlog.Fatal(err)\n\t}\n\tdefer f.Close()\n\n\tw := bufio.NewWriter(f)\n\tdefer w.Flush()\n\n\tencoder := gob.NewEncoder(w)\n\n\tif err := encoder.Encode(s); err != nil {\n\t\tlog.Fatal(err)\n\t}\n}\n\nfunc useCached(filename string) *lingo.Dependency {\n\tf, err := os.Open(filename)\n\tif err != nil {\n\t\tlog.Fatal(err)\n\t}\n\tdefer f.Close()\n\n\tr := bufio.NewReader(f)\n\tdecoder := gob.NewDecoder(r)\n\n\tvar sentence lingo.AnnotatedSentence\n\tif err := decoder.Decode(&sentence); err != nil {\n\t\tlog.Fatal(err)\n\t}\n\t// fixes ID and what nots\n\tsentence.Fix()\n\n\tdep := sentence.Dependency()\n\treturn dep\n}\n"
  },
  {
    "path": "dep/train.go",
    "content": "package dep\n\nimport (\n\t\"fmt\"\n\t\"os\"\n\t\"sync\"\n\n\t\"github.com/chewxy/lingo\"\n\t\"github.com/chewxy/lingo/corpus\"\n\t\"github.com/chewxy/lingo/treebank\"\n\t\"github.com/pkg/errors\"\n)\n\n// TrainerConsOpt is a construction option for trainer\ntype TrainerConsOpt func(t *Trainer)\n\n// WithTrainingModel loads a trainer with a model\nfunc WithTrainingModel(m *Model) TrainerConsOpt {\n\tf := func(t *Trainer) {\n\t\tt.Model = m\n\t}\n\treturn f\n}\n\n// WithTrainingSet creates a trainer with a training set\nfunc WithTrainingSet(st []treebank.SentenceTag) TrainerConsOpt {\n\tf := func(t *Trainer) {\n\t\tt.trainingSet = st\n\t}\n\treturn f\n}\n\n// WithCrossValidationSet creates a trainer with a cross validation set\nfunc WithCrossValidationSet(st []treebank.SentenceTag) TrainerConsOpt {\n\tf := func(t *Trainer) {\n\t\tt.crossValSet = st\n\t}\n\treturn f\n}\n\n// WithConfig sets up a *Trainer with a NNConfig\nfunc WithConfig(conf NNConfig) TrainerConsOpt {\n\tf := func(t *Trainer) {\n\t\tt.nn.NNConfig = conf\n\t\tt.nn.dict = t.corpus\n\t\tt.nn.transitions = t.ts\n\t\tt.EvalPerIter = conf.EvalPerIteration\n\t}\n\treturn f\n}\n\n// WithLemmatizer sets the lemmatizer option on the Trainer\nfunc WithLemmatizer(l lingo.Lemmatizer) TrainerConsOpt {\n\tf := func(t *Trainer) {\n\t\t// cannot pass in itself!\n\t\tif T, ok := l.(*Trainer); ok && T == t {\n\t\t\tpanic(\"Recursive definition of lemmatizer (trying to set the t.lemmatizer = T) !\")\n\t\t}\n\n\t\tt.l = l\n\t}\n\treturn f\n}\n\n// WithStemmer sets up the stemmer option on the DependencyParser\nfunc WithStemmer(s lingo.Stemmer) TrainerConsOpt {\n\tf := func(t *Trainer) {\n\t\t// cannot pass in itself\n\t\tif T, ok := s.(*Trainer); ok && T == t {\n\t\t\tpanic(\"Recursive setting of stemmer! 
(Trying to set t.stemmer = T)\")\n\t\t}\n\t\tt.s = s\n\t}\n\treturn f\n}\n\n// WithCluster sets the brown cluster options for the DependencyParser\nfunc WithCluster(c map[string]lingo.Cluster) TrainerConsOpt {\n\tf := func(t *Trainer) {\n\t\tt.c = c\n\t}\n\treturn f\n}\n\n// WithCorpus creates a Trainer with a corpus\nfunc WithCorpus(c *corpus.Corpus) TrainerConsOpt {\n\tf := func(t *Trainer) {\n\t\tt.corpus = c\n\t\tt.nn.dict = c\n\t}\n\treturn f\n}\n\n// WithGeneratedCorpus creates a Trainer's corpus from a list of SentenceTags. The corpus will be generated from the SentenceTags\nfunc WithGeneratedCorpus(sts ...treebank.SentenceTag) TrainerConsOpt {\n\tf := func(t *Trainer) {\n\t\tdict := corpus.GenerateCorpus(sts)\n\t\tif t.corpus == nil {\n\t\t\tt.corpus = dict\n\t\t} else {\n\t\t\tt.corpus.Merge(dict)\n\t\t}\n\n\t\tt.nn.dict = t.corpus\n\t}\n\treturn f\n}\n\n// Trainer trains a model\ntype Trainer struct {\n\ttrainingSet []treebank.SentenceTag\n\tcrossValSet []treebank.SentenceTag\n\n\tonce sync.Once\n\t*Model\n\n\t// Training configuration\n\tEvalPerIter int    // for cross validation - evaluate results every n epochs\n\tPassDirect  bool   // Pass on the costs directly to the cost channel? If false, an average will be used\n\tSaveBest    string // SaveBest is the filename that will be saved. 
If it's empty then the best-while-training will not be saved\n\n\t// fixer\n\tl lingo.Lemmatizer\n\ts lingo.Stemmer\n\tc map[string]lingo.Cluster\n\n\terr  chan error\n\tcost chan float64\n\tperf chan Performance\n}\n\n// NewTrainer creates a new Trainer.\nfunc NewTrainer(opts ...TrainerConsOpt) *Trainer {\n\tt := new(Trainer)\n\t// set up the default model\n\tt.Model = new(Model)\n\tt.corpus = KnownWords\n\tt.ts = transitions\n\n\t// set up the neural network\n\tt.nn = new(neuralnetwork2)\n\tt.nn.NNConfig = DefaultNNConfig\n\tt.nn.transitions = transitions\n\tt.nn.dict = KnownWords\n\n\tfor _, opt := range opts {\n\t\topt(t)\n\t}\n\treturn t\n}\n\n// Lemmatize implements lingo.Lemmatizer\nfunc (t *Trainer) Lemmatize(a string, pt lingo.POSTag) ([]string, error) {\n\tif t.l == nil {\n\t\treturn nil, componentUnavailable(\"Lemmatizer\")\n\t}\n\treturn t.l.Lemmatize(a, pt)\n}\n\n// Stem implements lingo.Stemmer\nfunc (t *Trainer) Stem(a string) (string, error) {\n\tif t.s == nil {\n\t\treturn \"\", componentUnavailable(\"Stemmer\")\n\t}\n\treturn t.s.Stem(a)\n}\n\n// Clusters implements lingo.Fixer\nfunc (t *Trainer) Clusters() (map[string]lingo.Cluster, error) {\n\tif t.c == nil {\n\t\treturn nil, componentUnavailable(\"Clusters\")\n\t}\n\treturn t.c, nil\n}\n\n/* Getters */\n\n// Cost returns a channel of costs for monitoring the training. If the PassDirect field in the trainer is set to true\n// then the costs are directly returned. 
Otherwise the costs are averaged over the epoch.\nfunc (t *Trainer) Cost() <-chan float64 {\n\tif t.cost == nil {\n\t\tt.cost = make(chan float64)\n\t}\n\treturn t.cost\n}\n\n// Perf returns a channel of Performance for monitoring the training.\nfunc (t *Trainer) Perf() <-chan Performance {\n\tif t.perf == nil {\n\t\tt.perf = make(chan Performance)\n\t}\n\treturn t.perf\n}\n\n/* Methods */\n\n// Init initializes the DependencyParser with a corpus and a neural network config\nfunc (t *Trainer) Init() (err error) {\n\tf := func() {\n\t\terr = t.nn.init()\n\t}\n\tt.once.Do(f)\n\treturn\n}\n\n// Train trains a model.\n//\n// If a cross validation set is provided, it will automatically train with the cross validation set\nfunc (t *Trainer) Train(epochs int) error {\n\tif err := t.pretrainCheck(); err != nil {\n\t\treturn err\n\t}\n\tif len(t.crossValSet) > 0 {\n\t\treturn t.crossValidateTrain(epochs)\n\t}\n\treturn t.train(epochs)\n}\n\n// TrainWithoutCrossValidation trains a model without cross validation.\nfunc (t *Trainer) TrainWithoutCrossValidation(epochs int) error {\n\treturn t.train(epochs)\n}\n\n// train simply trains the model without having a cross validation.\nfunc (t *Trainer) train(epochs int) error {\n\n\tvar epochChan chan struct{}\n\tif t.cost != nil {\n\t\tdefer func() {\n\t\t\tclose(t.cost)\n\t\t\tt.cost = nil\n\t\t}()\n\n\t\tepochChan = t.handleCosts()\n\t\tif epochChan != nil {\n\t\t\tdefer close(epochChan)\n\t\t}\n\t}\n\n\texamples := makeExamples(t.trainingSet, t.nn.NNConfig, t.nn.dict, t.ts, t)\n\n\tfor e := 0; e < epochs; e++ {\n\t\tif err := t.nn.train(examples); err != nil {\n\t\t\treturn err\n\t\t}\n\n\t\tif epochChan != nil {\n\t\t\tepochChan <- struct{}{}\n\t\t}\n\n\t\tshuffleExamples(examples)\n\t}\n\treturn nil\n}\n\n// crossValidateTrain trains the model but also does cross validation to ensure overfitting doesn't happen.\nfunc (t *Trainer) crossValidateTrain(epochs int) error {\n\tif t.perf != nil {\n\t\tdefer func() 
{\n\t\t\tclose(t.perf)\n\t\t\tt.perf = nil\n\t\t}()\n\t}\n\n\tvar epochChan chan struct{}\n\tif t.cost != nil {\n\t\tdefer func() {\n\t\t\tclose(t.cost)\n\t\t\tt.cost = nil\n\t\t}()\n\n\t\tepochChan = t.handleCosts()\n\t\tif epochChan != nil {\n\t\t\tdefer close(epochChan)\n\t\t}\n\t}\n\texamples := makeExamples(t.trainingSet, t.nn.NNConfig, t.nn.dict, t.ts, t)\n\n\tvar best Performance\n\tfor e := 0; e < epochs; e++ {\n\t\tif err := t.nn.train(examples); err != nil {\n\t\t\treturn err\n\t\t}\n\n\t\tif t.EvalPerIter > 0 && e%t.EvalPerIter == 0 || e == epochs-1 {\n\t\t\tperf := t.crossValidate(t.crossValSet)\n\n\t\t\t// if there is a channel to report back the performance, send it down\n\t\t\tif t.perf != nil {\n\t\t\t\tperf.Iter = e\n\t\t\t\tt.perf <- perf\n\t\t\t}\n\n\t\t\tif perf.UAS > best.UAS {\n\t\t\t\tbest = perf\n\n\t\t\t\tif t.SaveBest != \"\" {\n\t\t\t\t\tf, err := os.Create(t.SaveBest)\n\t\t\t\t\tif err != nil {\n\t\t\t\t\t\terr = errors.Wrapf(err, \"Unable to open SaveBest file %q\", t.SaveBest)\n\t\t\t\t\t\treturn err\n\t\t\t\t\t}\n\n\t\t\t\t\tt.Model.SaveWriter(f)\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\n\t\tif epochChan != nil {\n\t\t\tepochChan <- struct{}{}\n\t\t}\n\n\t\tshuffleExamples(examples)\n\t}\n\treturn nil\n}\n\n// pretrainCheck checks if everything is sane\nfunc (t *Trainer) pretrainCheck() error {\n\t// check\n\tif t.nn == nil || !t.nn.initialized() {\n\t\treturn errors.Errorf(\"DependencyParser not init()'d. Perhaps you forgot to call .Init() somewhere?\")\n\t}\n\n\tif len(t.trainingSet) == 0 {\n\t\treturn errors.Errorf(\"Cannot train with no training data set\")\n\t}\n\n\treturn nil\n}\n\n// handleCosts handles the costs from the neural network in two ways:\n//\t\t1. pass: directly passes on the costs (which may come from multiple batches in an epoch)\n//\t\t2. mean: calculates the mean of the costs and passes it on into d.cost\n//\n// If d.cost is nil, it simply returns. 
This method should be called after a check that d.cost is not nil\nfunc (t *Trainer) handleCosts() (epochChan chan struct{}) {\n\tnncost := t.nn.costProgress()\n\n\tif t.PassDirect {\n\t\tgo func() {\n\t\t\tfor cost := range nncost {\n\t\t\t\tswitch c := cost.Data().(type) {\n\t\t\t\tcase float32:\n\t\t\t\t\tt.cost <- float64(c)\n\t\t\t\tcase float64:\n\t\t\t\t\tt.cost <- c\n\t\t\t\tdefault:\n\t\t\t\t\t// this should NEVER happen\n\t\t\t\t\tpanic(fmt.Sprintf(\"Unhandled cost type %T\", c))\n\t\t\t\t}\n\t\t\t}\n\t\t}()\n\t} else {\n\t\tepochChan = make(chan struct{})\n\n\t\t// it collects the costs until the epoch chan signals that an epoch is done. Then the cost is averaged and sent down the d.cost channel\n\t\tgo func(epochChan chan struct{}) {\n\t\t\tvar collected []float64\n\t\t\tfor {\n\t\t\t\tselect {\n\t\t\t\tcase cost := <-nncost:\n\t\t\t\t\tswitch c := cost.Data().(type) {\n\t\t\t\t\tcase float32:\n\t\t\t\t\t\tcollected = append(collected, float64(c))\n\t\t\t\t\tcase float64:\n\t\t\t\t\t\tcollected = append(collected, c)\n\t\t\t\t\tdefault:\n\t\t\t\t\t\t// this should NEVER happen\n\t\t\t\t\t\tpanic(fmt.Sprintf(\"Unhandled cost type %T\", c))\n\t\t\t\t\t}\n\t\t\t\tcase <-epochChan:\n\t\t\t\t\tvar avg float64\n\t\t\t\t\tfor _, cost := range collected {\n\t\t\t\t\t\tavg += cost\n\t\t\t\t\t}\n\n\t\t\t\t\tif len(collected) > 0 {\n\t\t\t\t\t\tavg /= float64(len(collected))\n\t\t\t\t\t}\n\n\t\t\t\t\tt.cost <- avg\n\t\t\t\t\tcollected = collected[:0]\n\t\t\t\t}\n\t\t\t}\n\t\t}(epochChan)\n\t}\n\treturn\n}\n"
  },
  {
    "path": "dep/train_test.go",
    "content": "package dep\n\nimport (\n\t\"testing\"\n\n\t\"github.com/chewxy/lingo/corpus\"\n\n\tG \"gorgonia.org/gorgonia\"\n)\n\nfunc TestTrainerInitializations(t *testing.T) {\n\tvar d *Trainer\n\tc := corpus.New()\n\n\td = NewTrainer(WithCorpus(c))\n\tif d.corpus != c {\n\t\tt.Errorf(\"Expected Corpus to be set to %p. Got %p instead\", c, d.corpus)\n\t}\n\n\td = NewTrainer(WithConfig(DefaultNNConfig))\n\tif d.corpus != KnownWords {\n\t\tt.Error(\"Expected corpus to be set to the default KnownWords corpus\")\n\t}\n\tif d.nn == nil {\n\t\tt.Fatal(\"Expected a neural network\")\n\t}\n\tif d.nn.dict != KnownWords {\n\t\tt.Error(\"Expected neuralnetwork's dict to be set\")\n\t}\n\n\t// d2 = d.Clone()\n\t// if d2.nn != d.nn {\n\t// \tt.Error(\"Expected a neural network!\")\n\t// }\n\n\t// // init empty\n\t// d = New()\n\t// if err := d.Init(); err != nil {\n\t// \tt.Errorf(\"%+v\", err)\n\t// }\n\n\t// // init with a corpus\n\t// d = New(WithCorpus(c))\n\t// if err := d.Init(); err != nil {\n\t// \tt.Errorf(\"%+v\", err)\n\t// }\n}\n\nfunc TestTrainer_train(t *testing.T) {\n\tsts := allSentences()\n\tepochs := 10\n\n\tvar err error\n\n\ttrainer := NewTrainer(WithGeneratedCorpus(sts...), WithTrainingSet(sts))\n\tif err = trainer.Train(epochs); err == nil {\n\t\tt.Error(\"Expected an error when training an uninitialized Trainer\")\n\t}\n\n\t// with init\n\tt.Logf(\"Pass On Costs Directly\")\n\tconf := DefaultNNConfig\n\tconf.BatchSize = 90\n\ttrainer = NewTrainer(WithGeneratedCorpus(sts...), WithConfig(conf), WithTrainingSet(sts))\n\tif err := trainer.Init(); err != nil {\n\t\tt.Errorf(\"%+v\", err)\n\t}\n\ttrainer.PassDirect = true\n\n\tvar costs []float64\n\tcost := trainer.Cost()\n\n\tgo func() {\n\t\tfor c := range cost {\n\t\t\tcosts = append(costs, c)\n\t\t\tt.Logf(\"Cost %v\", c)\n\t\t}\n\t}()\n\n\tif err = trainer.Train(epochs); err != nil {\n\t\tt.Errorf(\"Err: %v\", err)\n\t}\n\n\tif len(costs) == 0 {\n\t\tt.Errorf(\"Zero costs...\")\n\t\tgoto 
avgcosts\n\t}\n\n\tt.Logf(\"Costs %d\", len(costs))\n\tif len(costs) < (epochs*2)-5 { // we'll allow some tolerance\n\t\tt.Errorf(\"Expected some costs\")\n\t}\n\tif costs[0] < costs[len(costs)-1] {\n\t\tt.Errorf(\"Costs should be reducing\")\n\t}\n\navgcosts:\n\t// with init, avg costs\n\tt.Logf(\"Average Costs\")\n\tcosts = costs[:0] // reset\n\tconf = DefaultNNConfig\n\tconf.Dtype = G.Float32\n\n\ttrainer = NewTrainer(WithGeneratedCorpus(sts...), WithConfig(conf), WithTrainingSet(sts))\n\tif err := trainer.Init(); err != nil {\n\t\tt.Errorf(\"%+v\", err)\n\t}\n\ttrainer.PassDirect = false\n\n\tcost = trainer.Cost()\n\n\tgo func() {\n\t\tfor c := range cost {\n\t\t\tcosts = append(costs, c)\n\t\t\tt.Logf(\"Cost %v\", c)\n\t\t}\n\t}()\n\tif err = trainer.Train(epochs); err != nil {\n\t\tt.Errorf(\"%v\", err)\n\t}\n\n\tif len(costs) == 0 {\n\t\tt.Fatal(\"Zero costs\")\n\t}\n\n\tt.Logf(\"Costs %d\", len(costs))\n\tif len(costs) == 0 {\n\t\tt.Errorf(\"Expected some costs\")\n\t}\n\n\tif costs[0] < costs[len(costs)-1] {\n\t\tt.Errorf(\"Costs should be reducing\")\n\t}\n}\n\nfunc TestTestTrainer_crossValidateTrain(t *testing.T) {\n\tsts := allSentences()\n\tcv := cvSentences()\n\tepochs := 10\n\n\tvar trainer *Trainer\n\tvar err error\n\n\t// uninit\n\tt.Logf(\"Uninitiated\")\n\ttrainer = NewTrainer(WithGeneratedCorpus(sts...))\n\tif err = trainer.Train(epochs); err == nil {\n\t\tt.Errorf(\"Expected an error when training with an uninitialized Trainer\")\n\t}\n\n\t// with init\n\tt.Logf(\"Pass On Costs Directly\")\n\tconf := DefaultNNConfig\n\tconf.BatchSize = 90\n\ttrainer = NewTrainer(WithGeneratedCorpus(sts...), WithConfig(conf), WithTrainingSet(sts), WithCrossValidationSet(cv))\n\ttrainer.PassDirect = true\n\tif err := trainer.Init(); err != nil {\n\t\tt.Errorf(\"%+v\", err)\n\t}\n\n\tvar costs []float64\n\tcost := trainer.Cost()\n\tperf := trainer.Perf()\n\n\tgo func() {\n\t\tfor p := range perf {\n\t\t\tt.Logf(\"Perf \\n%v\", p)\n\t\t}\n\t}()\n\n\tgo func() 
{\n\t\tfor c := range cost {\n\t\t\tcosts = append(costs, c)\n\t\t\tt.Logf(\"Cost %v\", c)\n\t\t}\n\t}()\n\tif err = trainer.Train(epochs); err != nil {\n\t\tt.Error(err)\n\t}\n\n\tif len(costs) == 0 {\n\t\tt.Errorf(\"Zero costs\")\n\t\tgoto avgCosts\n\t}\n\n\tt.Logf(\"Costs %d\", len(costs))\n\tif len(costs) < (epochs*2)-5 { // we'll allow some tolerance\n\t\tt.Errorf(\"Expected some costs\")\n\t}\n\tif costs[0] < costs[len(costs)-1] {\n\t\tt.Errorf(\"Costs should be reducing\")\n\t}\n\navgCosts:\n\t// with init, avg costs, and using float32\n\tt.Logf(\"Average Costs\")\n\tcosts = costs[:0] // reset\n\tconf = DefaultNNConfig\n\tconf.Dtype = G.Float32\n\ttrainer = NewTrainer(WithGeneratedCorpus(sts...), WithConfig(conf), WithTrainingSet(sts), WithCrossValidationSet(cv))\n\tif err := trainer.Init(); err != nil {\n\t\tt.Errorf(\"%+v\", err)\n\t}\n\ttrainer.PassDirect = false\n\n\tcost = trainer.Cost()\n\tperf = trainer.Perf()\n\n\tgo func() {\n\t\tfor p := range perf {\n\t\t\tt.Logf(\"Perf \\n%v\", p)\n\t\t}\n\t}()\n\n\tgo func() {\n\t\tfor c := range cost {\n\t\t\tcosts = append(costs, c)\n\t\t\tt.Logf(\"Cost %v\", c)\n\t\t}\n\t}()\n\ttrainer.Train(epochs)\n\n\tif len(costs) == 0 {\n\t\tt.Fatal(\"Zero costs\")\n\t}\n\n\tt.Logf(\"Costs %d\", len(costs))\n\tif len(costs) == 0 {\n\t\tt.Errorf(\"Expected some costs\")\n\t}\n\n\tif costs[0] < costs[len(costs)-1] {\n\t\tt.Errorf(\"Costs should be reducing\")\n\t}\n}\n"
  },
  {
    "path": "dep/transition.go",
    "content": "package dep\n\nimport (\n\t\"fmt\"\n\n\t\"github.com/chewxy/lingo\"\n)\n\n// transition is a tuple of Move and label\ntype transition struct {\n\tMove\n\tlingo.DependencyType\n}\n\nvar transitions []transition\nvar MAXTRANSITION int\n\nfunc buildTransitions(labels []lingo.DependencyType) []transition {\n\tts := make([]transition, 0)\n\t// for _, l := range labels {\n\t// \tif l == lingo.NoDepType {\n\t// \t\tcontinue\n\t// \t}\n\t// \tt := transition{Left, l}\n\t// \tts = append(ts, t)\n\t// }\n\n\t// for _, l := range labels {\n\t// \tif l == lingo.NoDepType {\n\t// \t\tcontinue\n\t// \t}\n\n\t// \tt := transition{Right, l}\n\t// \tts = append(ts, t)\n\t// }\n\n\t// ts = append(ts, transition{Shift, lingo.NoDepType})\n\n\tfor _, m := range ALLMOVES {\n\t\tfor _, l := range labels {\n\t\t\tif (m == Shift && l != lingo.NoDepType) || (m != Shift && l == lingo.NoDepType) {\n\t\t\t\tcontinue\n\t\t\t}\n\t\t\tt := transition{m, l}\n\t\t\tts = append(ts, t)\n\t\t}\n\t}\n\treturn ts\n}\n\nfunc (t transition) String() string {\n\treturn fmt.Sprintf(\"(%s, %s)\", t.Move, t.DependencyType)\n}\n\nfunc lookupTransition(t transition, table []transition) int {\n\tfor i, v := range table {\n\t\tif v == t {\n\t\t\treturn i\n\t\t}\n\t}\n\tpanic(fmt.Sprintf(\"Transition %v not found\", t))\n}\n\n// this builds the default transitions\nfunc init() {\n\tlbls := make([]lingo.DependencyType, lingo.MAXDEPTYPE)\n\n\tfor i := 0; i < int(lingo.MAXDEPTYPE); i++ {\n\t\tlbls[i] = lingo.DependencyType(i)\n\t}\n\n\ttransitions = buildTransitions(lbls)\n\tMAXTRANSITION = len(transitions)\n}\n"
  },
  {
    "path": "dep/util.go",
    "content": "package dep\n\nfunc minInt(a, b int) int {\n\tif a < b {\n\t\treturn a\n\t}\n\treturn b\n}\n\nfunc maxInt(a, b int) int {\n\tif a > b {\n\t\treturn a\n\t}\n\treturn b\n}\n"
  },
  {
    "path": "dependency.go",
    "content": "package lingo\n\nimport (\n\t\"bytes\"\n\t\"fmt\"\n)\n\n// Dependency represents the dependency parse of a sentence. While AnnotatedSentence does\n// already do a job of representing the dependency parse of a sentence, *Dependency actually contains\n// meta information about the dependency parse (specifically, lefts, rights) that makes parsing a dependency a lot faster\n//\n// The fields are mostly left unexported for a good reason - a dependency parse SHOULD be static after it's been built\ntype Dependency struct {\n\tAnnotatedSentence\n\n\twordCount int\n\n\tlefts  [][]int\n\trights [][]int\n\n\tcounter int // for checking if a tree is projective\n\n\tn int\n}\n\ntype depConsOpt func(*Dependency)\n\n// FromAnnotatedSentence creates a dependency from an AnnotatedSentence.\nfunc FromAnnotatedSentence(s AnnotatedSentence) depConsOpt {\n\tfn := func(d *Dependency) {\n\t\twc := len(s)\n\t\td.AnnotatedSentence = s\n\t\td.wordCount = wc\n\t\td.n = wc - 1\n\t}\n\treturn fn\n}\n\n// AllocTree allocates the lefts and rights. Typical construction of the *Dependency doesn't allocate the trees as they're not necessary for a number of tasks.\nfunc AllocTree() depConsOpt {\n\tfn := func(d *Dependency) {\n\t\td.lefts = make([][]int, d.wordCount)\n\t\td.rights = make([][]int, d.wordCount)\n\t\tfor i := 0; i < d.wordCount; i++ {\n\t\t\td.lefts[i] = make([]int, 0)\n\t\t\td.rights[i] = make([]int, 0)\n\t\t}\n\t}\n\treturn fn\n}\n\n// NewDependency creates a new *Dependency. 
It takes optional construction options:\n//\t\tFromAnnotatedSentence\n//\t\tAllocTree\nfunc NewDependency(opts ...depConsOpt) *Dependency {\n\td := new(Dependency)\n\n\tfor _, opt := range opts {\n\t\topt(d)\n\t}\n\treturn d\n}\n\nfunc (d *Dependency) Sentence() AnnotatedSentence { return d.AnnotatedSentence }\nfunc (d *Dependency) Lefts() [][]int              { return d.lefts }\nfunc (d *Dependency) Rights() [][]int             { return d.rights }\nfunc (d *Dependency) WordCount() int              { return d.wordCount }\nfunc (d *Dependency) N() int                      { return d.n }\n\n// please only use these for testing\nfunc (d *Dependency) SetLefts(l [][]int)  { d.lefts = l }\nfunc (d *Dependency) SetRights(r [][]int) { d.rights = r }\n\nfunc (d *Dependency) Head(i int) int {\n\tif i < 0 || i >= d.wordCount || d.AnnotatedSentence[i].Head == nil {\n\t\treturn -1\n\t}\n\n\treturn d.AnnotatedSentence[i].HeadID()\n}\n\nfunc (d *Dependency) Label(i int) DependencyType {\n\tif i < 0 || i >= d.wordCount {\n\t\treturn NoDepType\n\t}\n\n\treturn d.AnnotatedSentence[i].DependencyType\n}\n\nfunc (d *Dependency) Annotation(i int) *Annotation {\n\tif i < 0 || i >= d.wordCount {\n\t\treturn nullAnnotation\n\t}\n\n\treturn d.AnnotatedSentence[i]\n}\n\nfunc (d *Dependency) AddArc(head, child int, label DependencyType) {\n\td.AddChild(head, child)\n\td.AddRel(child, label)\n}\n\nfunc (d *Dependency) AddChild(head, child int) {\n\theadAnn := d.AnnotatedSentence[head]\n\td.AnnotatedSentence[child].SetHead(headAnn)\n\n\tif child < head {\n\t\td.lefts[head] = append(d.lefts[head], child)\n\t} else {\n\t\td.rights[head] = append(d.rights[head], child)\n\t}\n\n\td.n++\n}\n\nfunc (d *Dependency) AddRel(child int, rel DependencyType) {\n\t// d.labels[child] = rel\n\td.AnnotatedSentence[child].DependencyType = rel\n}\n\nfunc (d *Dependency) HasSingleRoot() bool {\n\troots := 0\n\tfor _, a := range d.AnnotatedSentence {\n\t\th := a.HeadID()\n\t\tif h == 0 
{\n\t\t\troots++\n\t\t}\n\t}\n\n\treturn roots == 1\n}\n\nfunc (d *Dependency) IsLegal() bool {\n\tvar heads []int\n\tfor _, a := range d.AnnotatedSentence {\n\t\th := a.HeadID()\n\t\tif h < 0 || h > d.wordCount {\n\t\t\treturn false\n\t\t}\n\t\theads = append(heads, -1)\n\t}\n\n\tfor i := 1; i < d.wordCount; i++ {\n\t\tfor k := i; k > 0; {\n\t\t\tif heads[k] >= 0 && heads[k] < 1 {\n\t\t\t\tbreak\n\t\t\t}\n\t\t\tif heads[k] == i {\n\t\t\t\treturn false\n\t\t\t}\n\t\t\theads[k] = i\n\t\t\tk = d.AnnotatedSentence[k].HeadID()\n\t\t}\n\t}\n\n\treturn true\n}\n\nfunc (d *Dependency) IsProjective() bool {\n\td.counter = -1\n\treturn d.projectiveVisit(0)\n}\n\nfunc (d *Dependency) projectiveVisit(w int) bool {\n\tfor i := 1; i < w; i++ {\n\t\tif d.AnnotatedSentence[i].HeadID() == w && d.projectiveVisit(i) == false {\n\t\t\treturn false\n\t\t}\n\t}\n\n\td.counter++\n\n\tif w != d.counter {\n\t\treturn false\n\t}\n\n\tfor i := w + 1; i < d.wordCount; i++ {\n\t\tif d.AnnotatedSentence[i].HeadID() == w && d.projectiveVisit(i) == false {\n\t\t\treturn false\n\t\t}\n\t}\n\n\treturn true\n}\n\nfunc (d *Dependency) Root() int {\n\tfor i := 1; i <= d.n; i++ {\n\t\tif d.Head(i) == 0 {\n\t\t\treturn i\n\t\t}\n\t}\n\n\treturn 0\n}\n\nfunc (d *Dependency) SprintRel() string {\n\tvar buf bytes.Buffer\n\n\tfor _, e := range d.Edges() {\n\t\tfmt.Fprintf(&buf, \"%v(%q-%d, %q-%d)\\n\", e.Rel, e.Gov.Value, e.Gov.ID, e.Dep.Value, e.Dep.ID)\n\t}\n\n\treturn buf.String()\n}\n\ntype DependencyEdge struct {\n\tGov *Annotation\n\tDep *Annotation\n\tRel DependencyType\n}\n\n// Sort interface\n\ntype edgeByID []DependencyEdge\n\nfunc (b edgeByID) Len() int           { return len(b) }\nfunc (b edgeByID) Swap(i, j int)      { b[i], b[j] = b[j], b[i] }\nfunc (b edgeByID) Less(i, j int) bool { return b[i].Dep.ID < b[j].Dep.ID }\n"
  },
  {
    "path": "dependencyTree.go",
    "content": "package lingo\n\nimport (\n\t\"github.com/awalterschulze/gographviz\"\n\n\t\"fmt\"\n\n\t\"sync\"\n)\n\n// A DependencyTree is an alternate form of representing a dependency parse.\n// This form makes it easier to traverse the tree\ntype DependencyTree struct {\n\tParent *DependencyTree\n\n\tID   int            // the word number in a sentence\n\tType DependencyType // refers to the dependency type to the parent\n\tWord *Annotation\n\n\tChildren []*DependencyTree\n}\n\nfunc NewDependencyTree(parent *DependencyTree, ID int, ann *Annotation) *DependencyTree {\n\treturn &DependencyTree{\n\t\tParent:   parent,\n\t\tID:       ID,\n\t\tWord:     ann,\n\t\tChildren: make([]*DependencyTree, 0),\n\t}\n}\n\nfunc (d *DependencyTree) AddChild(child *DependencyTree) {\n\td.Children = append(d.Children, child)\n}\n\nfunc (d *DependencyTree) AddRel(rel DependencyType) {\n\td.Type = rel\n}\n\nfunc (d *DependencyTree) walk(c chan *DependencyTree, wg *sync.WaitGroup) {\n\tdefer wg.Done()\n\n\tfor _, child := range d.Children {\n\t\twg.Add(1)\n\t\tgo child.walk(c, wg)\n\t}\n\tc <- d // man someone should do somehting about my bad naming\n}\n\nfunc (d *DependencyTree) Dot() string {\n\t// walk graph\n\tc := make(chan *DependencyTree)\n\tout := make(chan string)\n\n\tgo dotString(c, out)\n\tvar wg sync.WaitGroup\n\twg.Add(1)\n\tgo d.walk(c, &wg)\n\n\twg.Wait()\n\tclose(c)\n\treturn <-out\n}\n\nfunc dotString(c chan *DependencyTree, out chan string) {\n\tg := gographviz.NewEscape()\n\tg.SetName(\"G\")\n\tg.SetDir(true) // it's always going to be a directed graph\n\t// g.AddNode(\"G\", \"Node_0x0\", nil) // add the root\n\n\tfor t := range c {\n\t\tid := fmt.Sprintf(\"Node_%p\", t)\n\t\tattrs := map[string]string{\n\t\t\t\"label\": fmt.Sprintf(\"%d: \\\"%s/%s\\\"\", t.ID, t.Word.Value, t.Word.POSTag),\n\t\t}\n\t\tg.AddNode(\"G\", id, attrs)\n\n\t\tif t.Parent == nil {\n\t\t\tcontinue\n\t\t}\n\n\t\tparentID := fmt.Sprintf(\"Node_%p\", t.Parent)\n\t\tedgeAttrs := 
map[string]string{\n\t\t\t\"label\": fmt.Sprintf(\"%v\", t.Type),\n\t\t}\n\t\tg.AddEdge(parentID, id, true, edgeAttrs)\n\t}\n\tout <- g.String()\n}\n\nfunc (d *DependencyTree) Walk(fn func(interface{})) {\n\tfor _, child := range d.Children {\n\t\tchild.Walk(fn)\n\t}\n\n\tif fn != nil {\n\t\tfn(d)\n\t}\n}\n"
  },
  {
    "path": "dependencyType.go",
    "content": "package lingo\n\nimport (\n\t\"fmt\"\n\t\"strings\"\n)\n\n// DependencyType represents the relation between two words\ntype DependencyType byte\n\nvar dependencyTypeLookup map[string]DependencyType\n\nfunc init() {\n\tdependencyTypeLookup = make(map[string]DependencyType)\n\tfor dt := NoDepType; dt < MAXDEPTYPE; dt++ {\n\t\ts := dt.String()\n\t\tdependencyTypeLookup[s] = DependencyType(dt)\n\t\tdependencyTypeLookup[strings.ToLower(s)] = DependencyType(dt)\n\t}\n}\n\nfunc (dt DependencyType) MarshalText() ([]byte, error) {\n\treturn []byte(fmt.Sprintf(\"%v\", dt)), nil\n}\n\nfunc (dt *DependencyType) UnmarshalText(text []byte) error {\n\tstr := strings.Trim(string(text), `\"`) // for JSON use, if any\n\tdeptype, _ := dependencyTypeLookup[str]\n\t*dt = deptype\n\treturn nil\n}\n\n// list of dependency type functions\n\nfunc InDepTypes(x DependencyType, set []DependencyType) bool {\n\tfor _, v := range set {\n\t\tif v == x {\n\t\t\treturn true\n\t\t}\n\t}\n\treturn false\n}\n\nfunc IsModifier(x DependencyType) bool      { return InDepTypes(x, Modifiers) }\nfunc IsCompound(x DependencyType) bool      { return InDepTypes(x, Compounds) }\nfunc IsDeterminerRel(x DependencyType) bool { return InDepTypes(x, DeterminerRels) }\nfunc IsMultiword(x DependencyType) bool     { return InDepTypes(x, MultiWord) }\nfunc IsQuantifier(x DependencyType) bool    { return InDepTypes(x, QuantifingMods) }\n"
  },
  {
    "path": "dependencyType_stanford.go",
    "content": "// +build stanfordrel\n\npackage lingo\n\nconst BUILD_RELSET = \"stanfordrel\"\n\n//go:generate stringer -type=DependencyType -output=dependencyType_stanford_string.go\n\n// http://nlp.stanford.edu/software/dependencies_manual.pdf\nconst (\n\tNoDepType DependencyType = iota\n\tDep\n\tRoot\n\tAux           // Auxilliary\n\tAuxPass       // passive auxiliary\n\tCop           // Copula\n\tArg           // argument\n\tAgent         // agent\n\tComp          // Complement\n\tAComp         // adjectival complement\n\tCComp         // clausal complement with internal subject\n\tXComp         // clausal complement with external subject\n\tObj           // Object\n\tDObj          // Direct Object\n\tIObj          // Indirect Object\n\tPObj          // Object of preposition\n\tSubj          // subject\n\tNSubj         // Nominal Subject\n\tNSubjPass     // passive nominal subject\n\tCSubj         // clausal subject\n\tCSubjPass     // passive clausal subject\n\tCoordination  // coordination (cannot use CC, as it's a POSTag)\n\tConj          // conjunction\n\tExpl          // Expletive\n\tMod           // modifier\n\tAMod          // adjectival modifier\n\tAppos         // Appositional modifier\n\tAdvcl         // adverbial clause modifier\n\tDet           // determiner\n\tPredet        // predeterminer\n\tPreconj       // Preconjunction\n\tVmod          // reduced, nonfinite verbal modifier\n\tMWE           // multiword expression modifier\n\tMark          // marker (word introducing an Advcl or CComp)\n\tAdvMod        // adverbial modifier\n\tNeg           // negation modifier\n\tRCMod         // relative clause modifier\n\tQuantMod      // quantifier modifier\n\tNounMod       // Noun Compound Modifier (cannot use NN because NN is defined as a POSTag)\n\tNPAdvMod      // Noun phrase adverbial modifier\n\tTMod          // temporal modifier\n\tNum           // Numeric Modifier\n\tNumberElement // element of compound number (cannot use Number because Number is 
defined as a LexemeType)\n\tPrep          // prepositional modifier\n\tPoss          // possession modifier\n\tPossessive    // possessive modifier ('s)\n\tPRT           // phrasal verb particle\n\tParataxis     // Parataxis (words that are next to each other)\n\tGoesWith      // GoesWith\n\tPunct         // punctuation\n\tRef           // referent\n\tSDep          // Semantic Dependent\n\tXSubj         // controlling subject\n\n\t// additional stuff not found in the original, but found in EWT\n\tCase\n\tCompound\n\tNMod\n\tDiscourse\n\tNumMod\n\tRelCl\n\tNFinCl\n\tNMod_Poss\n\tNMod_NPMod\n\tVocative\n\tList\n\tMWPrep // multiword prepositional modifier\n\tRemnant\n\tAcl\n\tNPMod\n\tMDVod\n\tDetMod\n\n\t// found in stanford nnparser SD models\n\tPComp\n\n\tMAXDEPTYPE\n)\n\nvar Modifiers = []DependencyType{AMod}\nvar Compounds = []DependencyType{Compound}\nvar DeterminerRels = []DependencyType{Det, DetMod}\nvar MultiWord = []DependencyType{MWE, MWPrep, Compound, Parataxis}\nvar QuantifingMods = []DependencyType{QuantMod, NumMod}\n"
  },
  {
    "path": "dependencyType_stanford_string.go",
    "content": "// +build stanfordrel\n\n// Code generated by \"stringer -type=DependencyType -output=dependencyType_stanford_string.go\"; DO NOT EDIT\n\npackage lingo\n\nimport \"fmt\"\n\nconst _DependencyType_name = \"NoDepTypeDepRootNSubjNSubjPassDObjIObjCSubjCSubjPassCCompXCompNumModApposNModAClACl_RelClDetDet_PreDetAModNegCaseNMod_NPModNMod_TModNMod_PossAdvClAdvModCompoundCompound_PartMWEListParataxisDiscourseExplAuxAuxPassCopMarkPunctConjCoordinationCC_PreConjMAXDEPTYPE\"\n\nvar _DependencyType_index = [...]uint16{0, 9, 12, 16, 21, 30, 34, 38, 43, 52, 57, 62, 68, 73, 77, 80, 89, 92, 102, 106, 109, 113, 123, 132, 141, 146, 152, 160, 173, 176, 180, 189, 198, 202, 205, 212, 215, 219, 224, 228, 240, 250, 260}\n\nfunc (i DependencyType) String() string {\n\tif i >= DependencyType(len(_DependencyType_index)-1) {\n\t\treturn fmt.Sprintf(\"DependencyType(%d)\", i)\n\t}\n\treturn _DependencyType_name[_DependencyType_index[i]:_DependencyType_index[i+1]]\n}\n"
  },
  {
    "path": "dependencyType_universal.go",
    "content": "// +build !stanfordrel\n\npackage lingo\n\nconst BUILD_RELSET = \"universalrel\"\n\n//go:generate stringer -type=DependencyType -output=dependencyType_universal_string.go\n\n// http://universaldependencies.github.io/docs/en/dep/all.html\nconst (\n\tNoDepType DependencyType = iota\n\tDep\n\tRoot\n\n\t// Core dependents of clausal predicates\n\n\t// nominal dependencies\n\tNSubj\n\tNSubjPass\n\tDObj\n\tIObj\n\n\t// predicate dependencies\n\tCSubj\n\tCSubjPass\n\tCComp\n\n\tXComp\n\n\t// Noun dependents\n\n\t// nominal dependencies\n\tNumMod\n\tAppos\n\tNMod\n\n\t// predicate dependencies\n\tACl\n\tACl_RelCl // RCMod in stanford deps\n\tDet\n\tDet_PreDet\n\n\t// modifier word\n\tAMod\n\tNeg\n\n\t// Case Marking, preposition, possessive\n\tCase\n\n\t//Non-Core Dependents of Clausal Predicates\n\n\t// Nominal dependencies\n\tNMod_NPMod\n\tNMod_TMod\n\tNMod_Poss\n\n\t// Predicate Dependencies\n\tAdvCl\n\n\t// Modifier Word\n\tAdvMod\n\n\t// Compounding and Unanalyzed\n\tCompound\n\tCompound_Part\n\tName // Unused in English\n\tMWE\n\tForeign  // Unused in English\n\tGoesWith // Unused in English\n\n\t// Loose Joining Relations\n\tList\n\tDislocated // Unused in English\n\tParataxis\n\tRemnant    // Unused in English\n\tReparandum // Unused in English\n\n\t// Special Clausal Dependents\n\n\t// Nominal Dependent\n\tVocative // Unused in English\n\tDiscourse\n\tExpl\n\n\t// Auxilliary\n\tAux\n\tAuxPass\n\tCop\n\n\t// Other\n\tMark\n\tPunct\n\n\t// Coordination\n\n\tConj\n\tCoordination // CC\n\tCC_PreConj\n\n\tMAXDEPTYPE\n)\n\nvar Modifiers = []DependencyType{AMod}\nvar Compounds = []DependencyType{Compound, Compound_Part}\nvar DeterminerRels = []DependencyType{Det, Det_PreDet}\nvar MultiWord = []DependencyType{MWE, Compound, Compound_Part, Parataxis}\nvar QuantifingMods = []DependencyType{NumMod}\n"
  },
  {
    "path": "dependencyType_universal_string.go",
    "content": "// +build !stanfordrel\n\n// Code generated by \"stringer -type=DependencyType -output=dependencyType_universal_string.go\"; DO NOT EDIT\n\npackage lingo\n\nimport \"fmt\"\n\nconst _DependencyType_name = \"NoDepTypeDepRootNSubjNSubjPassDObjIObjCSubjCSubjPassCCompXCompNumModApposNModAClACl_RelClDetDet_PreDetAModNegCaseNMod_NPModNMod_TModNMod_PossAdvClAdvModCompoundCompound_PartNameMWEForeignGoesWithListDislocatedParataxisRemnantReparandumVocativeDiscourseExplAuxAuxPassCopMarkPunctConjCoordinationCC_PreConjMAXDEPTYPE\"\n\nvar _DependencyType_index = [...]uint16{0, 9, 12, 16, 21, 30, 34, 38, 43, 52, 57, 62, 68, 73, 77, 80, 89, 92, 102, 106, 109, 113, 123, 132, 141, 146, 152, 160, 173, 177, 180, 187, 195, 199, 209, 218, 225, 235, 243, 252, 256, 259, 266, 269, 273, 278, 282, 294, 304, 314}\n\nfunc (i DependencyType) String() string {\n\tif i >= DependencyType(len(_DependencyType_index)-1) {\n\t\treturn fmt.Sprintf(\"DependencyType(%d)\", i)\n\t}\n\treturn _DependencyType_name[_DependencyType_index[i]:_DependencyType_index[i+1]]\n}\n"
  },
  {
    "path": "errors.go",
    "content": "package lingo\n\ntype componentUnavailable interface {\n\terror\n\tComponent() string\n}\n"
  },
  {
    "path": "go.mod",
    "content": "module github.com/chewxy/lingo\n\nrequire (\n\tgithub.com/abiosoft/ishell v2.0.0+incompatible\n\tgithub.com/abiosoft/readline v0.0.0-20180607040430-155bce2042db // indirect\n\tgithub.com/awalterschulze/gographviz v0.0.0-20190221210632-1e9ccb565bca\n\tgithub.com/chewxy/hm v1.0.0 // indirect\n\tgithub.com/chewxy/math32 v1.0.0 // indirect\n\tgithub.com/chzyer/logex v1.1.10 // indirect\n\tgithub.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1 // indirect\n\tgithub.com/davecgh/go-spew v1.1.1 // indirect\n\tgithub.com/fatih/color v1.7.0 // indirect\n\tgithub.com/flynn-archive/go-shlex v0.0.0-20150515145356-3f9db97f8568 // indirect\n\tgithub.com/gogo/protobuf v1.2.1 // indirect\n\tgithub.com/golang/protobuf v1.2.0 // indirect\n\tgithub.com/google/flatbuffers v1.10.0 // indirect\n\tgithub.com/kljensen/snowball v0.6.0\n\tgithub.com/leesper/go_rng v0.0.0-20171009123644-5344a9259b21 // indirect\n\tgithub.com/mattn/go-colorable v0.1.1 // indirect\n\tgithub.com/mattn/go-isatty v0.0.6 // indirect\n\tgithub.com/pkg/browser v0.0.0-20180916011732-0a3d74bf9ce4\n\tgithub.com/pkg/errors v0.8.1\n\tgithub.com/stretchr/testify v1.3.0\n\tgithub.com/xtgo/set v1.0.0\n\tgolang.org/x/exp v0.0.0-20190221220918-438050ddec5e // indirect\n\tgolang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4 // indirect\n\tgolang.org/x/sys v0.0.0-20190225065934-cc5685c2db12 // indirect\n\tgolang.org/x/text v0.3.0\n\tgonum.org/v1/gonum v0.0.0-20190221132855-8ea67971a689 // indirect\n\tgonum.org/v1/netlib v0.0.0-20190221094214-0632e2ebbd2d // indirect\n\tgorgonia.org/cu v0.9.0-beta // indirect\n\tgorgonia.org/dawson v1.1.0 // indirect\n\tgorgonia.org/gorgonia v0.9.1\n\tgorgonia.org/tensor v0.9.0-beta\n\tgorgonia.org/vecf32 v0.7.0 // indirect\n\tgorgonia.org/vecf64 v0.7.0 // indirect\n)\n\ngo 1.13\n"
  },
  {
    "path": "go.sum",
    "content": "github.com/abiosoft/ishell v2.0.0+incompatible/go.mod h1:HQR9AqF2R3P4XXpMpI0NAzgHf/aS6+zVXRj14cVk9qg=\ngithub.com/abiosoft/readline v0.0.0-20180607040430-155bce2042db/go.mod h1:rB3B4rKii8V21ydCbIzH5hZiCQE7f5E9SzUb/ZZx530=\ngithub.com/awalterschulze/gographviz v0.0.0-20190221210632-1e9ccb565bca h1:xwIXr1FpA2XBoohlpvgb11No/zbsh5Clm/98PWPcHVA=\ngithub.com/awalterschulze/gographviz v0.0.0-20190221210632-1e9ccb565bca/go.mod h1:GEV5wmg4YquNw7v1kkyoX9etIk8yVmXj+AkDHuuETHs=\ngithub.com/chewxy/hm v1.0.0 h1:zy/TSv3LV2nD3dwUEQL2VhXeoXbb9QkpmdRAVUFiA6k=\ngithub.com/chewxy/hm v1.0.0/go.mod h1:qg9YI4q6Fkj/whwHR1D+bOGeF7SniIP40VweVepLjg0=\ngithub.com/chewxy/math32 v1.0.0 h1:RTt2SACA7BTzvbsAKVQJLZpV6zY2MZw4bW9L2HEKkHg=\ngithub.com/chewxy/math32 v1.0.0/go.mod h1:Miac6hA1ohdDUTagnvJy/q+aNnEk16qWUdb8ZVhvCN0=\ngithub.com/chzyer/logex v1.1.10 h1:Swpa1K6QvQznwJRcfTfQJmTE72DqScAa40E+fbHEXEE=\ngithub.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI=\ngithub.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1 h1:q763qf9huN11kDQavWsoZXJNW3xEE4JJyHa5Q25/sd8=\ngithub.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU=\ngithub.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=\ngithub.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=\ngithub.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=\ngithub.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4=\ngithub.com/flynn-archive/go-shlex v0.0.0-20150515145356-3f9db97f8568/go.mod h1:rZfgFAXFS/z/lEd6LJmf9HVZ1LkgYiHx5pHhV5DR16M=\ngithub.com/gogo/protobuf v1.2.1 h1:/s5zKNz0uPFCZ5hddgPdo2TK2TVrUNMn0OOX8/aZMTE=\ngithub.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4=\ngithub.com/golang/protobuf v1.2.0 h1:P3YflyNX/ehuJFLhxviNdFxQPkGK5cDcApsge1SqnvM=\ngithub.com/golang/protobuf v1.2.0/go.mod 
h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=\ngithub.com/google/flatbuffers v1.10.0 h1:wHCM5N1xsJ3VwePcIpVqnmjAqRXlR44gv4hpGi+/LIw=\ngithub.com/google/flatbuffers v1.10.0/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8=\ngithub.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q=\ngithub.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=\ngithub.com/kljensen/snowball v0.6.0/go.mod h1:27N7E8fVU5H68RlUmnWwZCfxgt4POBJfENGMvNRhldw=\ngithub.com/leesper/go_rng v0.0.0-20171009123644-5344a9259b21 h1:O75p5GUdUfhJqNCMM1ntthjtJCOHVa1lzMSfh5Qsa0Y=\ngithub.com/leesper/go_rng v0.0.0-20171009123644-5344a9259b21/go.mod h1:N0SVk0uhy+E1PZ3C9ctsPRlvOPAFPkCNlcPBDkt0N3U=\ngithub.com/mattn/go-colorable v0.1.1/go.mod h1:FuOcm+DKB9mbwrcAfNl7/TZVBZ6rcnceauSikq3lYCQ=\ngithub.com/mattn/go-isatty v0.0.5/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s=\ngithub.com/mattn/go-isatty v0.0.6/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s=\ngithub.com/pkg/browser v0.0.0-20180916011732-0a3d74bf9ce4/go.mod h1:4OwLy04Bl9Ef3GJJCoec+30X3LQs/0/m4HFRt/2LUSA=\ngithub.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I=\ngithub.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=\ngithub.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=\ngithub.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=\ngithub.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=\ngithub.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=\ngithub.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=\ngithub.com/xtgo/set v1.0.0 h1:6BCNBRv3ORNDQ7fyoJXRv+tstJz3m1JVFQErfeZz2pY=\ngithub.com/xtgo/set v1.0.0/go.mod h1:d3NHzGzSa0NmB2NhFyECA+QdRp29oEn2xbT+TpeFoM8=\ngolang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod 
h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=\ngolang.org/x/exp v0.0.0-20190221220918-438050ddec5e h1:dVreTP5bOOWt5GFwwvgTE2iU0TkIqi2x3r0b8qGlp6k=\ngolang.org/x/exp v0.0.0-20190221220918-438050ddec5e/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=\ngolang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4 h1:YUO/7uOKsKeq9UokNS62b8FYywz3ker1l1vDZRCRefw=\ngolang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=\ngolang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=\ngolang.org/x/sys v0.0.0-20190225065934-cc5685c2db12 h1:Zw7eRv6INHGfu15LVRN1vrrwusJbnfJjAZn3D1VkQIE=\ngolang.org/x/sys v0.0.0-20190225065934-cc5685c2db12/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=\ngolang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=\ngolang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=\ngolang.org/x/tools v0.0.0-20190206041539-40960b6deb8e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=\ngonum.org/v1/gonum v0.0.0-20190221132855-8ea67971a689 h1:C+7Si2b5qgXShERPqwtDu36i1o1yf1VM93A3GZIe9Fk=\ngonum.org/v1/gonum v0.0.0-20190221132855-8ea67971a689/go.mod h1:jevfED4GnIEnJrWW55YmY9DMhajHcnkqVnEXmEtMyNI=\ngonum.org/v1/netlib v0.0.0-20190221094214-0632e2ebbd2d h1:m4zHh49Vwhwq5n7qC7NRl5SqRfTyT/6PP2ASGNGRB1E=\ngonum.org/v1/netlib v0.0.0-20190221094214-0632e2ebbd2d/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw=\ngorgonia.org/cu v0.9.0-beta h1:s4WQ6fiAGoErwIiXWHRB6Y9ydkx1vTTPwhWzoEZVePc=\ngorgonia.org/cu v0.9.0-beta/go.mod h1:RPEPIfaxxqUmeRe7T1T8a0NER+KxBI2McoLEXhP1Vd8=\ngorgonia.org/dawson v1.1.0 h1:o7+eJ3SKi9sheH19lpOat//tDbg0Y+M9iY/lH79VHqY=\ngorgonia.org/dawson v1.1.0/go.mod h1:Px1mcziba8YUBIDsbzGwbKJ11uIblv/zkln4jNrZ9Ws=\ngorgonia.org/gorgonia v0.9.1 h1:6blWHSDHCplQHem+pvo9dZvtsQp7l3ZiVqXk26frn9M=\ngorgonia.org/gorgonia v0.9.1/go.mod 
h1:qucT7YHm/2OuSHWEw/6Je/LQ5htRJNQJ1+qpB58fY8c=\ngorgonia.org/tensor v0.9.0-beta h1:16QQufB1vbJxVbIOaB5TwkerdlBWtw+AAnZHUZ531ZE=\ngorgonia.org/tensor v0.9.0-beta/go.mod h1:05Y4laKuVlj4qFoZIZW1q/9n1jZkgDBOLmKXZdBLG1w=\ngorgonia.org/vecf32 v0.7.0 h1:mkpVzSyT7/Cput5/ZxaMzzp2xbmOtqOyJlTf7AdSMe0=\ngorgonia.org/vecf32 v0.7.0/go.mod h1:iHG+kvTMqGYA0SgahfO2k62WRnxmHsqAREGbayRDzy8=\ngorgonia.org/vecf64 v0.7.0 h1:ZphOGJfnWlFfY7x8WAJAfO64IAtYqPPq9TEGem+ItZE=\ngorgonia.org/vecf64 v0.7.0/go.mod h1:1y4pmcSd+wh3phG+InwWQjYrqwyrtN9h27WLFVQfV1Q=\n"
  },
  {
    "path": "interfaces.go",
    "content": "package lingo\n\nimport (\n\t\"encoding/gob\"\n\n\t\"gorgonia.org/tensor\"\n)\n\n// Lemmatizer is anything that can lemmatize\ntype Lemmatizer interface {\n\tLemmatize(string, POSTag) ([]string, error)\n}\n\n// Stemmer is anything that can stem\ntype Stemmer interface {\n\tStem(string) (string, error)\n}\n\n// Sentencer is anything that returns an AnnotatedSentence\ntype Sentencer interface {\n\tSentence() AnnotatedSentence\n}\n\n// Corpus is the interface for the corpus.\ntype Corpus interface {\n\t// ID returns the ID of a word and whether or not it was found in the corpus\n\tId(word string) (id int, ok bool)\n\n\t// Word returns the word given the ID, and whether or not it was found in the corpus\n\tWord(id int) (word string, ok bool)\n\n\t// Add adds a word to the corpus and returns its ID. If a word was previously in the corpus, it merely updates the frequency count and returns the ID\n\tAdd(word string) int\n\n\t// Size returns the size of the corpus.\n\tSize() int\n\n\t// WordFreq returns the frequency of the word. If the word wasn't in the corpus, it returns 0.\n\tWordFreq(word string) int\n\n\t// IDFreq returns the frequency of a word given an ID. If the word isn't in the corpus it returns 0.\n\tIDFreq(id int) int\n\n\t// TotalFreq returns the total number of words ever seen by the corpus. 
This number includes the count of repeat words.\n\tTotalFreq() int\n\n\t// MaxWordLength returns the length of the longest known word in the corpus\n\tMaxWordLength() int\n\n\t// WordProb returns the probability of a word appearing in the corpus\n\tWordProb(word string) (float64, bool)\n\n\t// IO stuff\n\tgob.GobEncoder\n\tgob.GobDecoder\n}\n\n// WordEmbeddings is any type that is both a corpus and can return word vectors\ntype WordEmbeddings interface {\n\tCorpus\n\n\t// WordVector returns a vector of embeddings given the word\n\tWordVector(word string) (vec tensor.Tensor, err error)\n\n\t// Vector returns a vector of embeddings given the word ID\n\tVector(id int) (vec tensor.Tensor, err error)\n\n\t// Embedding returns the matrix\n\tEmbedding() tensor.Tensor\n}\n"
  },
  {
    "path": "io.go",
    "content": "package lingo\n\nimport (\n\t\"bytes\"\n\t\"encoding/json\"\n\t\"fmt\"\n\t\"strings\"\n\n\t\"github.com/pkg/errors\"\n)\n\ntype dummyAnnotation struct {\n\tPOSTag         `json:\"POSTag\"`\n\tDependencyType `json:\"Label\"`\n\n\tID    int    `json:\"ID\"`\n\tHead  int    `json:\"Head\"`\n\tValue string `json:\"Value\"`\n\tLemma string `json:\"Lemma\"`\n\tStem  string `json:\"Stem\"`\n\n\tCluster  `json:\"Cluster\"`\n\tShape    `json:\"Shape\"`\n\tWordFlag `json:\"WordFlat\"`\n}\n\n// func (a *Annotation) MarshalText() ([]byte, error) {\n// \tvar buf bytes.Buffer\n// \tif a.Head != nil {\n// \t\tfmt.Fprintf(&buf, \"%v(%q/%v-%d, %q/%v-%d)\", a.DependencyType, a.Value, a.POSTag, a.ID, a.Head.Value, a.Head.POSTag, a.Head.ID)\n// \t} else if a == rootAnnotation {\n// \t\tfmt.Fprintf(&buf, \"ROOT\")\n// \t} else {\n// \t\tfmt.Fprintf(&buf, \"%q/%v-%d\", a.Value, a.POSTag, a.ID)\n// \t}\n// \treturn buf.Bytes(), nil\n// }\n\nfunc (a *Annotation) MarshalJSON() ([]byte, error) {\n\tvar buf bytes.Buffer\n\tbuf.WriteRune('{')\n\n\tfmt.Fprintf(&buf, \"\\\"ID\\\": %d,\", a.ID)\n\tfmt.Fprintf(&buf, \"\\\"Value\\\": %q,\", a.Value)\n\tfmt.Fprintf(&buf, \"\\\"POSTag\\\": \\\"%v\\\",\", a.POSTag)\n\tfmt.Fprintf(&buf, \"\\\"Label\\\": \\\"%v\\\"\", a.DependencyType)\n\n\tif a.Head != nil {\n\t\tif a.Head == rootAnnotation {\n\t\t\tfmt.Fprintf(&buf, \", \\\"Head\\\": -1000\") // special signifier for root annotations\n\t\t} else {\n\t\t\tfmt.Fprintf(&buf, \", \\\"Head\\\": %d\", a.HeadID())\n\t\t}\n\t}\n\n\tif a.Lemma != \"\" {\n\t\tfmt.Fprintf(&buf, \", \\\"Lemma\\\": %q\", a.Lemma)\n\t}\n\n\t// Lowered is not serialized because it's a simple function call away\n\n\tif a.Stem != \"\" {\n\t\tfmt.Fprintf(&buf, \",\\\"Stem\\\": %q\", a.Stem)\n\t}\n\n\tif a.Cluster > 0 {\n\t\tfmt.Fprintf(&buf, \",\\\"Cluster\\\": %d\", a.Cluster)\n\t}\n\n\tif a.Shape != \"\" {\n\t\tfmt.Fprintf(&buf, \",\\\"Shape\\\": %q\", a.Shape)\n\t}\n\n\tif a.WordFlag > 0 {\n\t\tfmt.Fprintf(&buf, 
\",\\\"WordFlag\\\": %d\", a.WordFlag)\n\t}\n\tbuf.WriteRune('}')\n\treturn buf.Bytes(), nil\n}\n\nfunc (a *Annotation) UnmarshalJSON(b []byte) error {\n\tif a == nil {\n\t\t// error\n\t\treturn errors.Errorf(\"Cannot unmarshal json to a nul\")\n\t}\n\n\td := dummyAnnotation{}\n\tif err := json.Unmarshal(b, &d); err != nil {\n\t\treturn err\n\t}\n\n\ta.Value = d.Value\n\ta.POSTag = d.POSTag\n\ta.DependencyType = d.DependencyType\n\ta.ID = d.ID\n\ta.Lemma = d.Lemma\n\ta.Stem = d.Stem\n\ta.Cluster = d.Cluster\n\ta.Shape = d.Shape\n\ta.WordFlag = d.WordFlag\n\n\treturn nil\n}\n\nfunc (as AnnotatedSentence) MarshalJSON() ([]byte, error) {\n\tbuf := new(bytes.Buffer)\n\tencoder := json.NewEncoder(buf)\n\n\tbuf.WriteRune('[')\n\tfor i, a := range as {\n\t\tif err := encoder.Encode(a); err != nil {\n\t\t\treturn nil, err\n\t\t}\n\t\tif i < len(as)-1 {\n\t\t\tbuf.WriteRune(',')\n\t\t}\n\t}\n\tbuf.WriteRune(']')\n\treturn buf.Bytes(), nil\n}\n\nfunc (as *AnnotatedSentence) UnmarshalJSON(b []byte) error {\n\tdummies := make([]dummyAnnotation, 0)\n\n\tif err := json.Unmarshal(b, &dummies); err != nil {\n\t\treturn err\n\t}\n\n\tasL := len(*as)\n\tl := len(dummies)\n\tif asL != l {\n\t\tdiff := l - asL\n\t\t(*as) = append(*as, make(AnnotatedSentence, diff)...)\n\t}\n\n\tfor i, d := range dummies {\n\t\ta := (*as)[i]\n\t\tif d.Value == \"-ROOT-\" {\n\t\t\t(*as)[i] = rootAnnotation\n\t\t\tcontinue\n\t\t}\n\n\t\tif a == nil {\n\t\t\ta = new(Annotation)\n\t\t}\n\n\t\ta.Value = d.Value\n\t\ta.POSTag = d.POSTag\n\t\ta.DependencyType = d.DependencyType\n\t\ta.ID = d.ID\n\t\ta.Lemma = d.Lemma\n\t\ta.Stem = d.Stem\n\t\ta.Cluster = d.Cluster\n\t\ta.Shape = d.Shape\n\t\ta.WordFlag = d.WordFlag\n\n\t\t(*as)[i] = a\n\t}\n\n\t// fix up head IDs\n\tfor i, d := range dummies {\n\t\ta := (*as)[i]\n\t\thead := d.Head\n\t\tif head == -1000 {\n\t\t\ta.SetHead(rootAnnotation)\n\t\t} else {\n\t\t\ta.SetHead((*as)[head])\n\t\t}\n\t}\n\n\t// TODO: fix up other things\n\tfor _, a := range *as 
{\n\t\ta.Lowered = strings.ToLower(a.Value)\n\t}\n\n\treturn nil\n}\n"
  },
  {
    "path": "io_test.go",
    "content": "package lingo\n\nimport (\n\t\"encoding/json\"\n\t\"testing\"\n)\n\nfunc TestAnnotationJSON(t *testing.T) {\n\ta := NewAnnotation()\n\ta.Value = \"hello\"\n\ta.POSTag = NOUN\n\ta.DependencyType = Aux\n\ta.ID = 2\n\n\tb, err := json.Marshal(a)\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n\tt.Logf(\" %s\", string(b))\n\n\tx := `{\"ID\":2,\"Value\":\"hello\",\"POSTag\":\"NOUN\",\"Label\":\"Aux\"}`\n\tc := NewAnnotation()\n\tif err = json.Unmarshal([]byte(x), c); err != nil {\n\t\tt.Error(err)\n\t}\n\n\tif c.Value != a.Value {\n\t\tt.Errorf(\"Expected Value to be %q. Got %q insteed\", a.Value, c.Value)\n\t}\n\n\tif c.POSTag != a.POSTag {\n\t\tt.Errorf(\"Expected POSTag to be %v. Got %v instead\", a.POSTag, c.POSTag)\n\t}\n\n\tif c.DependencyType != a.DependencyType {\n\t\tt.Errorf(\"Expected DependencyType to be %v. Got %v instead\", a.DependencyType, c.DependencyType)\n\t}\n}\n\nfunc TestAnnotatedSentenceJSON(t *testing.T) {\n\ta := NewAnnotation()\n\ta.Value = \"hello\"\n\ta.POSTag = NOUN\n\ta.DependencyType = Aux\n\ta.ID = 0\n\n\tb := NewAnnotation()\n\tb.Value = \"world\"\n\tb.POSTag = NOUN\n\tb.DependencyType = Aux\n\tb.ID = 1\n\tb.Head = rootAnnotation\n\n\ta.Head = b\n\n\tas := AnnotatedSentence{a, b}\n\tbs, err := json.Marshal(as)\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\tt.Logf(\"%s\", string(bs))\n\n\tx := `[{\"ID\":0,\"Value\":\"hello\",\"POSTag\":\"NOUN\",\"Label\":\"Aux\",\"Head\":1},{\"ID\":1,\"Value\":\"world\",\"POSTag\":\"NOUN\",\"Label\":\"Aux\",\"Head\":-1000}]`\n\n\tvar cs AnnotatedSentence\n\tif err = json.Unmarshal([]byte(x), &cs); err != nil {\n\t\tt.Error(err)\n\t}\n\tt.Logf(\"%v\", cs)\n\n\tfor i, c := range cs {\n\t\td := as[i]\n\n\t\tif c.Value != d.Value {\n\t\t\tt.Error(\"Expected Values to be the same\")\n\t\t}\n\n\t\tif c.POSTag != d.POSTag {\n\t\t\tt.Error(\"POSTag not the same\")\n\t\t}\n\n\t\tif c.DependencyType != d.DependencyType {\n\t\t\tt.Error(\"Dependency Types not the same\")\n\t\t}\n\n\t\tif c.HeadID() != 
d.HeadID() {\n\t\t\tt.Errorf(\"%v HeadIDs not the same. Want %v, got %v instead\", d, d.HeadID(), c.HeadID())\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "lexeme.go",
    "content": "package lingo\n\nimport (\n\t\"fmt\"\n\t\"unicode\"\n)\n\n//go:generate stringer -type=LexemeType\n\ntype LexemeType byte\n\nconst (\n\tEOF LexemeType = iota\n\tWord\n\tDisambig\n\tURI\n\tNumber\n\tDate\n\tTime\n\tPunctuation\n\tSymbol\n\tSpace\n\tSystemUse\n)\n\ntype Lexeme struct {\n\tValue      string\n\tLexemeType LexemeType\n\n\tLine int\n\tCol  int\n\tPos  int\n}\n\nfunc MakeLexeme(s string, t LexemeType) Lexeme {\n\treturn Lexeme{\n\t\tValue:      s,\n\t\tLexemeType: t,\n\t\tLine:       -1,\n\t\tCol:        -1,\n\t\tPos:        -1,\n\t}\n}\n\nfunc (l Lexeme) Fix() Lexeme {\n\tif StringIs(l.Value, unicode.IsDigit) {\n\t\tl.LexemeType = Number\n\t\treturn l\n\t}\n\treturn l\n}\n\nfunc (l Lexeme) String() string {\n\tswitch l.LexemeType {\n\tcase EOF:\n\t\treturn \"EOF\"\n\tdefault:\n\t\treturn fmt.Sprintf(\"%q/%v\", l.Value, l.LexemeType)\n\t}\n}\n\nfunc (l Lexeme) GoString() string {\n\tswitch l.LexemeType {\n\tcase EOF:\n\t\treturn fmt.Sprintf(\"EOF: %q (%d, %d, %d)\", l.Value, l.Line, l.Col, l.Pos)\n\tdefault:\n\t\treturn fmt.Sprintf(\"%s: %q (%d, %d, %d)\", l.LexemeType, l.Value, l.Line, l.Col, l.Pos)\n\t}\n}\n\nvar startLexeme = MakeLexeme(\"START_LEXEME\", SystemUse)\nvar rootLexeme = MakeLexeme(\"-ROOT-\", SystemUse)\nvar nullLexeme = MakeLexeme(\"\", SystemUse)\n\nfunc StartLexeme() Lexeme { return startLexeme }\nfunc RootLexeme() Lexeme  { return rootLexeme }\nfunc NullLexeme() Lexeme  { return nullLexeme }\n"
  },
  {
    "path": "lexemetype_string.go",
    "content": "// Code generated by \"stringer -type=LexemeType\"; DO NOT EDIT\n\npackage lingo\n\nimport \"fmt\"\n\nconst _LexemeType_name = \"EOFWordDisambigURINumberDateTimePunctuationSymbolSpaceSystemUse\"\n\nvar _LexemeType_index = [...]uint8{0, 3, 7, 15, 18, 24, 28, 32, 43, 49, 54, 63}\n\nfunc (i LexemeType) String() string {\n\tif i >= LexemeType(len(_LexemeType_index)-1) {\n\t\treturn fmt.Sprintf(\"LexemeType(%d)\", i)\n\t}\n\treturn _LexemeType_name[_LexemeType_index[i]:_LexemeType_index[i+1]]\n}\n"
  },
  {
    "path": "lexer/lexer.go",
    "content": "package lexer\n\nimport (\n\t\"bufio\"\n\t\"bytes\"\n\t\"io\"\n\t\"strings\"\n\t\"sync\"\n\n\t\"golang.org/x/text/unicode/norm\"\n\n\t\"github.com/chewxy/lingo\"\n)\n\nconst eof rune = -1\n\ntype Lexer struct {\n\tname  string\n\tinput *bufio.Reader\n\n\tstate stateFn\n\tr     rune\n\twidth int\n\tpos   int\n\tstart int\n\tline  int\n\tcol   int\n\n\t// the string we're reading\n\tbuf *bytes.Buffer\n\n\tOutput chan lingo.Lexeme\n\tErrors chan error\n\n\tsync.Mutex\n}\n\nfunc New(name string, r io.Reader) *Lexer {\n\treturn &Lexer{\n\t\tname:  name,\n\t\tinput: bufio.NewReader(r),\n\n\t\twidth: 1,\n\t\tstart: 1, // for userfriendliness, the column index starts at 1\n\t\tcol:   1,\n\t\tpos:   1,\n\t\tbuf:   new(bytes.Buffer),\n\n\t\tOutput: make(chan lingo.Lexeme),\n\t\tErrors: make(chan error),\n\t}\n}\n\nfunc (l *Lexer) Run() {\n\tl.Lock()\n\tdefer l.Unlock()\n\tdefer close(l.Output)\n\tfor state := lexText; state != nil; {\n\t\tstate = state(l)\n\t}\n}\n\n// Reset resets the buffers. 
It creates a new Output and Error channel\nfunc (l *Lexer) Reset(r io.Reader) {\n\tl.Lock()\n\tl.input.Reset(r)\n\tl.buf.Reset()\n\tl.Output = make(chan lingo.Lexeme)\n\tl.Errors = make(chan error)\n\tl.Unlock()\n}\n\nfunc (l *Lexer) next() rune {\n\tvar err error\n\tl.r, l.width, err = l.input.ReadRune()\n\tif err == io.EOF {\n\t\tl.width = 1\n\t\treturn eof\n\t}\n\tl.col += l.width\n\tl.pos += l.width\n\n\treturn l.r\n}\n\n// nextUntilEOF will loop until it finds the matching string OR EOF\nfunc (l *Lexer) nextUntilEOF(s string) bool {\n\tfor r := l.next(); r != eof && strings.IndexRune(s, r) < 0; r = l.next() {\n\t\t// l.next()\n\t\tl.accept()\n\t}\n\tif l.r == eof {\n\t\treturn true\n\t}\n\treturn false\n}\n\nfunc (l *Lexer) backup() {\n\tl.input.UnreadRune()\n\tl.pos -= l.width\n\tl.col -= l.width\n}\n\nfunc (l *Lexer) peek() rune {\n\tbackup := l.r\n\tpos := l.pos\n\tcol := l.col\n\n\tr := l.next()\n\tl.backup()\n\n\tl.pos = pos\n\tl.col = col\n\tl.r = backup\n\treturn r\n}\n\nfunc (l *Lexer) lineCount() {\n\tnewLines := bytes.Count(l.buf.Bytes(), []byte(\"\\n\"))\n\n\tl.line += newLines\n\tif newLines > 0 {\n\t\tl.col = 1\n\t}\n}\n\nfunc (l *Lexer) accept() {\n\tl.buf.WriteRune(l.r)\n}\n\nfunc (l *Lexer) acceptRun(valid string) (accepted bool) {\n\tfor strings.IndexRune(valid, l.peek()) >= 0 {\n\t\tl.next()\n\t\tl.accept()\n\t\taccepted = true\n\t}\n\treturn\n}\n\nfunc (l *Lexer) acceptRunFn(fn func(rune) bool) (accepted int) {\n\tfor fn(l.peek()) {\n\t\tl.next()\n\t\tl.accept()\n\t\taccepted++\n\t}\n\treturn\n}\n\nfunc (l *Lexer) ignore() {\n\tl.start = l.pos\n\tl.buf.Reset()\n}\n\nfunc (l *Lexer) emit(t lingo.LexemeType) {\n\tnormalized := string(norm.NFC.Bytes(l.buf.Bytes()))\n\tlex := lingo.MakeLexeme(normalized, t)\n\tlex.Line = l.line\n\tlex.Col = l.start\n\tlex.Pos = l.pos - l.buf.Len()\n\n\t// TODO: sometimes the offset is wrong on leading tokens since l.pos starts at 1\n\t// if lex.Pos < 0 {\n\t// \tlex.Pos = 0\n\t// }\n\n\tl.Output <- lex\n\n\t// 
reset\n\tl.ignore()\n\tif l.r != 0x0 {\n\t\tl.buf.WriteRune(l.r)\n\t}\n}\n"
  },
  {
    "path": "lexer/lexer_test.go",
    "content": "package lexer\n\nimport (\n\t\"strings\"\n\t\"testing\"\n\n\t\"github.com/chewxy/lingo\"\n)\n\ntype lexerTest struct {\n\tname string\n\ts    string\n\n\tlexemes []lingo.Lexeme\n}\n\nvar lexerTests = []lexerTest{\n\t// {\"empty\", \"\", []lingo.Lexeme{\n\t// \t{\"\", lingo.EOF, 0, 1, 0},\n\t// }},\n\t//\n\t// {\"spaces\", \" \\t\", []lingo.Lexeme{\n\t// \t{\"\", lingo.EOF, 0, 3, 2},\n\t// }},\n\t//\n\t// {\"newlines\", \"\\n\\r\\n\\n\", []lingo.Lexeme{\n\t// \t{\"\", lingo.EOF, 3, 5, 4},\n\t// }},\n\t//\n\t// {\"simple text\", \"hello world\", []lingo.Lexeme{\n\t// \t{\"hello\", lingo.Word, 0, 1, 0},\n\t// \t{\"world\", lingo.Word, 0, 7, 6},\n\t// \t{\"\", lingo.EOF, 0, 12, 11},\n\t// }},\n\t//\n\t// {\"simple number\", \"3.1415\", []lingo.Lexeme{\n\t// \t{\"3.1415\", lingo.Number, 0, 1, 0},\n\t// \t{\"\", lingo.EOF, 0, 12, 5},\n\t// }},\n\n\t{\"advanced numerology\", \"3.14 -1.618\", []lingo.Lexeme{\n\t\t{\"3.14\", lingo.Number, 0, 1, 0},\n\t\t{\"-1.618\", lingo.Number, 0, 6, 5},\n\t\t{\"\", lingo.EOF, 0, 11, 10},\n\t}},\n\n\t// {\"advanced numerology\", \"3.14 -1.618 6.023e23 1e-13\", []lingo.Lexeme{\n\t// \t{\"3.14\", lingo.Number, 0, 1, 0},\n\t// \t{\"-1.618\", lingo.Number, 0, 6, 5},\n\t// \t{\"6.023e23\", lingo.Number, 0, 13, 12},\n\t// \t{\"1e-13\", lingo.Number, 0, 21, 20},\n\t// \t{\"\", lingo.EOF, 0, 26, 25},\n\t// }},\n\t//\n\t// {\"esoteric numerology\", \"1/2 1 1/4\", []lingo.Lexeme{\n\t// \t{\"1/2\", lingo.Number, 0, 1, 0},\n\t// \t{\"1\", lingo.Number, 0, 5, 4},\n\t// \t{\"1/4\", lingo.Number, 0, 7, 6},\n\t// \t{\"\", lingo.EOF, 0, 10, 9},\n\t// }},\n\t//\n\t// {\"text with numbers\", \"one plus 1 don't equals 3\", []lingo.Lexeme{\n\t// \t{\"one\", lingo.Word, 0, 1, 0},\n\t// \t{\"plus\", lingo.Word, 0, 5, 4},\n\t// \t{\"1\", lingo.Number, 0, 10, 9},\n\t// \t{\"do\", lingo.Word, 0, 12, 11},\n\t// \t{\"n't\", lingo.Word, 0, 14, 13},\n\t// \t{\"equals\", lingo.Word, 0, 18, 17},\n\t// \t{\"3\", lingo.Number, 0, 24, 23},\n\t// \t{\"\", 
lingo.EOF, 0, 25, 24},\n\t// }},\n\t//\n\t// {\"text with numbers + punct\", \"First111!.!\", []lingo.Lexeme{\n\t// \t{\"First111\", lingo.Word, 0, 1, 0},\n\t// \t{\"!.!\", lingo.Punctuation, 0, 9, 8},\n\t// \t{\"\", lingo.EOF, 0, 10, 9},\n\t// }},\n\t//\n\t// {\"text with verb contractions\", \"You're panic'd I'll get'em I've\", []lingo.Lexeme{\n\t// \t{\"You\", lingo.Word, 0, 1, 0},\n\t// \t{\"'re\", lingo.Word, 0, 3, 2},\n\t// \t{\"panic\", lingo.Word, 0, 8, 7},\n\t// \t{\"'d\", lingo.Word, 0, 13, 12},\n\t// \t{\"I\", lingo.Word, 0, 16, 15},\n\t// \t{\"'ll\", lingo.Word, 0, 17, 16},\n\t// \t{\"get\", lingo.Word, 0, 21, 20},\n\t// \t{\"'em\", lingo.Word, 0, 24, 23},\n\t// \t{\"I\", lingo.Word, 0, 27, 26},\n\t// \t{\"'ve\", lingo.Word, 0, 30, 29},\n\t// \t{\"\", lingo.EOF, 0, 33, 32},\n\t// }},\n\t//\n\t// {\"email\", \"dont@email.this\", []lingo.Lexeme{\n\t// \t{\"dont@email.this\", lingo.Word, 0, 1},\n\t// \t{\"\", lingo.EOF, 0, 10},\n\t// }},\n\t//\n\t// {\"plain dashes should not be numbers\", \"this case - like so\", []lingo.Lexeme{\n\t// \t{\"this\", lingo.Word, 0, 1},\n\t// \t{\"case\", lingo.Word, 0, 5},\n\t// \t{\"-\", lingo.Punctuation, 0, 6},\n\t// \t{\"like\", lingo.Word, 0, 8},\n\t// \t{\"so\", lingo.Word, 0, 13},\n\t// \t{\"\", lingo.EOF, 0, 14},\n\t// }},\n\t//\n\t// {\"parens should be printed\", \"like (this)\", []lingo.Lexeme{\n\t// \t{\"like\", lingo.Word, 0, 1},\n\t// \t{\"(\", lingo.Punctuation, 0, 5},\n\t// \t{\"this\", lingo.Word, 0, 6},\n\t// \t{\")\", lingo.Punctuation, 0, 10},\n\t// \t{\"\", lingo.EOF, 0, 11},\n\t// }},\n\t//\n\t// {\"parenthesis should be considered separate\", \"USA(United States of America)\", []lingo.Lexeme{\n\t// \t{\"USA\", lingo.Word, 0, 1},\n\t// \t{\"(\", lingo.Punctuation, 0, 1},\n\t// \t{\"United\", lingo.Word, 0, 1},\n\t// \t{\"States\", lingo.Word, 0, 1},\n\t// \t{\"of\", lingo.Word, 0, 1},\n\t// \t{\"America\", lingo.Word, 0, 1},\n\t// \t{\")\", lingo.Punctuation, 0, 1},\n\t// \t{\"\", lingo.EOF, 0, 
0},\n\t// }},\n\t//\n\t// {\"midstream puncuation\", \"like:this\", []lingo.Lexeme{\n\t// \t{\"like\", lingo.Word, 0, 1},\n\t// \t{\":\", lingo.Punctuation, 0, 5},\n\t// \t{\"this\", lingo.Word, 0, 6},\n\t// \t{\"\", lingo.EOF, 0, 7},\n\t// }},\n\t//\n\t// {\"midstream symbols\", \"e-meet ke$ha by e-mail $ell anti-inflammatory\", []lingo.Lexeme{\n\t// \t{\"e-meet\", lingo.Word, 0, 1},\n\t// \t{\"ke$ha\", lingo.Word, 0, 1},\n\t// \t{\"by\", lingo.Word, 0, 1},\n\t// \t{\"e-mail\", lingo.Word, 0, 1},\n\t// \t{\"$\", lingo.Symbol, 0, 1},\n\t// \t{\"ell\", lingo.Word, 0, 1},\n\t// \t{\"anti-inflammatory\", lingo.Word, 0, 1},\n\t// \t{\"\", lingo.EOF, 0, 0},\n\t// }},\n\t//\n\t// {\"abbrev\", \"USB, made in U.S.A. e.g t/away c/o\", []lingo.Lexeme{\n\t// \t{\"USB\", lingo.Word, 0, 1},\n\t// \t{\",\", lingo.Punctuation, 0, 4},\n\t// \t{\"made\", lingo.Word, 0, 6},\n\t// \t{\"in\", lingo.Word, 0, 11},\n\t// \t{\"U.S.A\", lingo.Word, 0, 14},\n\t// \t{\".\", lingo.Punctuation, 0, 19},\n\t// \t{\"e.g\", lingo.Word, 0, 0},\n\t// \t{\"t/away\", lingo.Word, 0, 0},\n\t// \t{\"c/o\", lingo.Word, 0, 0},\n\t// \t{\"\", lingo.EOF, 0, 20},\n\t// }},\n\t//\n\t// {\"date time\", \"1970/1/1 00:00 00:00:00\", []lingo.Lexeme{\n\t// \t{\"1970/1/1\", lingo.Date, 0, 1},\n\t// \t{\"00:00\", lingo.Time, 0, 1},\n\t// \t{\"00:00:00\", lingo.Time, 0, 20},\n\t// \t{\"\", lingo.EOF, 0, 20},\n\t// }},\n\t//\n\t// {\"date time with dashes\", \"31-12-1970\", []lingo.Lexeme{\n\t// \t{\"31/12/1970\", lingo.Date, 0, 1},\n\t// \t{\"\", lingo.EOF, 0, 11},\n\t// }},\n\t//\n\t// {\"URI\", \"wobsite: http://www.wobsite.something.this/is/still/a.url\", []lingo.Lexeme{\n\t// \t{\"wobsite\", lingo.Word, 0, 1},\n\t// \t{\":\", lingo.Punctuation, 0, 8},\n\t// \t{\"http://www.wobsite.something.this/is/still/a.url\", lingo.URI, 0, 10},\n\t// \t{\"\", lingo.EOF, 0, 20},\n\t// }},\n\t//\n\t// {\"proper sentence\", \"hello world.\", []lingo.Lexeme{\n\t// \t{\"hello\", lingo.Word, 0, 1},\n\t// \t{\"world\", lingo.Word, 0, 
6},\n\t// \t{\".\", lingo.Punctuation, 0, 7},\n\t// \t{\"\", lingo.EOF, 0, 8},\n\t// }},\n\t//\n\t// // Naive and Cafe uses combination diacritics, while the rest are just unicode\n\t// // The lexer should normalize all the things\n\t// {\"pathological english words\", \"Façade à la Naïve Château Café\", []lingo.Lexeme{\n\t// \t{\"Façade\", lingo.Word, 0, 1},\n\t// \t{\"à\", lingo.Word, 0, 1},\n\t// \t{\"la\", lingo.Word, 0, 1},\n\t// \t{\"Naïve\", lingo.Word, 0, 1},\n\t// \t{\"Château\", lingo.Word, 0, 1},\n\t// \t{\"Café\", lingo.Word, 0, 1},\n\t// \t{\"\", lingo.EOF, 0, 0},\n\t// }},\n\t//\n\t// // just plain fucked\n\t// {\"jpf\", \"你好 العالم\", []lingo.Lexeme{\n\t// \t{\"你好\", lingo.Word, 0, 1},\n\t// \t{\"العالم\", lingo.Word, 0, 1},\n\t// \t{\"\", lingo.EOF, 0, 0},\n\t// }},\n}\n\nfunc testLexer(lts *lexerTest) []lingo.Lexeme {\n\tl := New(lts.name, strings.NewReader(lts.s))\n\tvar retVal []lingo.Lexeme\n\n\tgo l.Run()\n\tfor lex := range l.Output {\n\t\tretVal = append(retVal, lex)\n\t}\n\treturn retVal\n}\n\nfunc TestLexer(t *testing.T) {\n\tfor _, lts := range lexerTests {\n\t\tlexemes := testLexer(&lts)\n\n\t\tif len(lts.lexemes) != len(lexemes) {\n\t\t\tt.Errorf(\"Test %q: Expected %d lexemes. Got %d instead: %v\", lts.name, len(lts.lexemes), len(lexemes), lexemes)\n\t\t\tcontinue\n\t\t}\n\n\t\tfor i, lex := range lexemes {\n\t\t\tif lex.LexemeType != lts.lexemes[i].LexemeType || lex.Value != lts.lexemes[i].Value || lts.lexemes[i].Pos != lex.Pos {\n\t\t\t\tt.Errorf(\"Test %q, lexeme %d: Expected %#v. Got %#v instead\", lts.name, i, lts.lexemes[i], lex)\n\t\t\t}\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "lexer/stateFn.go",
    "content": "package lexer\n\nimport (\n\t\"unicode\"\n\n\t\"github.com/chewxy/lingo\"\n)\n\ntype stateFn func(*Lexer) stateFn\n\nfunc lexText(l *Lexer) (fn stateFn) {\n\tfor {\n\t\tnext := l.next()\n\t\tif next == eof {\n\t\t\tbreak\n\t\t}\n\t\tif l.pos != l.start {\n\t\t\tswitch {\n\t\t\tcase unicode.IsSpace(next):\n\t\t\t\tl.backup()\n\t\t\t\tfn = lexWhitespace\n\t\t\tcase unicode.IsDigit(next):\n\n\t\t\t\t// if the position is start +1.\n\t\t\t\t// This means that the first char of the string to be lexed is a number\n\t\t\t\t// this prevents things like \"yay1111\" to be lexed as \"yay\" and \"1111\"\n\t\t\t\tif l.pos == l.start+1 {\n\t\t\t\t\tl.backup()\n\t\t\t\t\treturn lexNumber\n\t\t\t\t}\n\t\t\tcase next == ':':\n\t\t\t\t// possible URI\n\t\t\t\tif l.peek() == '/' {\n\t\t\t\t\tl.accept() // accept ':'\n\t\t\t\t\tl.next()\n\t\t\t\t\tif l.peek() == '/' {\n\t\t\t\t\t\tl.accept()\n\t\t\t\t\t\treturn lexURI\n\t\t\t\t\t}\n\t\t\t\t\t// otherwise...\n\t\t\t\t\tl.backup()\n\t\t\t\t\t// \"unaccept\". since '/' has a width of 1 we can do the following\n\t\t\t\t\tl.buf.Truncate(l.buf.Len() - 1)\n\t\t\t\t}\n\t\t\t\tfn = lexPunctuation\n\t\t\tcase unicode.IsPunct(next):\n\t\t\t\t// For things like \"u.s\" or \"i.e.\" or \"e.g.\"\n\t\t\t\tn := l.peek()\n\n\t\t\t\tswitch {\n\t\t\t\tcase next == '\\'':\n\t\t\t\t\tif unicode.IsLetter(n) {\n\t\t\t\t\t\tl.emit(lingo.Word)\n\t\t\t\t\t\treturn lexText\n\t\t\t\t\t}\n\t\t\t\tcase n == eof:\n\t\t\t\t\t// common scenario - where a punctuation ends the sentence, and this thing is unable to backup\n\t\t\t\t\tl.width = 1\n\t\t\t\t\tl.backup()\n\t\t\t\t\tl.width = 0\n\t\t\t\t\tfn = lexPunctuation\n\t\t\t\t\tgoto finishup // goto because there are other cases below\n\t\t\t\tcase unicode.IsLetter(n) && (next == '.' 
|| next == '@' || next == '-' || next == '/'):\n\t\t\t\t\t// acceptable midstream punctuations in words are emails and abbreviations\n\t\t\t\t\tl.accept()\n\t\t\t\t\treturn lexText\n\t\t\t\tdefault:\n\t\t\t\t\t// it's definitely a punctuation\n\t\t\t\t\tl.backup()\n\t\t\t\t\tfn = lexPunctuation\n\t\t\t\t}\n\n\t\t\tcase unicode.IsSymbol(next):\n\t\t\t\t// for things like \"ke$ha\"\n\t\t\t\t// bear in mind that \"$ell\" will be split into two lexemes.\n\t\t\t\tn := l.peek()\n\t\t\t\tif unicode.IsLetter(n) {\n\t\t\t\t\tl.backup()\n\t\t\t\t\tl.accept()\n\t\t\t\t\treturn lexText\n\t\t\t\t}\n\t\t\t\t//l.backup()\n\t\t\t\tfn = lexSymbol\n\t\t\tcase next == 'n':\n\t\t\t\t// for things like \"don't\" or \"doesn't\"\n\t\t\t\tn := l.peek()\n\t\t\t\tif n == '\\'' {\n\t\t\t\t\tl.backup()\n\t\t\t\t\tl.emit(lingo.Word)\n\t\t\t\t\treturn lexPunctuation\n\t\t\t\t} else {\n\t\t\t\t\tl.accept() // accept n\n\t\t\t\t\treturn lexText\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\n\tfinishup:\n\t\tif fn != nil {\n\t\t\tif l.start != l.pos {\n\t\t\t\tl.emit(lingo.Word)\n\t\t\t}\n\t\t\treturn fn\n\t\t}\n\t\t// otherwise keep lexText\n\t\tl.accept()\n\t}\n\n\tif l.pos > l.start {\n\t\tl.emit(lingo.Word)\n\t}\n\n\tl.emit(lingo.EOF)\n\treturn nil\n}\n\n// lexNumber lexes numbers. It accepts runs of unicode digits.\n// Upon stopping, it checks to see if the next value is a '.'. 
If it is, then it's a decimal value, and continues a run\n// Upon stopping a second time, it checks for 'e' or 'E', for exponentiation - 1.2E2\nfunc lexNumber(l *Lexer) (fn stateFn) {\n\tl.acceptRunFn(unicode.IsDigit)\n\n\tnext := l.next()\n\tswitch next {\n\tcase '.':\n\t\tl.accept() // accept the dot\n\t\tl.acceptRunFn(unicode.IsDigit)\n\tcase '-', '/':\n\t\t// standardize\n\t\tl.r = '/'\n\t\tl.accept()\n\t\treturn lexDate\n\tcase ':':\n\t\tif l.pos-l.start == 3 {\n\t\t\tl.accept()\n\t\t\treturn lexTime\n\t\t} else {\n\t\t\tl.backup()\n\t\t\tl.emit(lingo.Number)\n\t\t\treturn lexPunctuation\n\t\t}\n\tdefault:\n\t\tl.backup()\n\t}\n\n\tif l.acceptRun(\"eE\") {\n\t\t// handle negative exponents\n\t\tif l.peek() == '-' {\n\t\t\tl.next()\n\t\t\tl.accept()\n\t\t\treturn lexNumber(l)\n\t\t}\n\t\tl.acceptRunFn(unicode.IsDigit)\n\t}\n\tl.backup()\n\n\tif l.buf.Len() == 1 && l.buf.Bytes()[0] == '-' {\n\t\tl.emit(lingo.Punctuation) // dash\n\t\treturn lexWhitespace\n\t}\n\tl.emit(lingo.Number)\n\treturn lexWhitespace\n}\n\nfunc lexWhitespace(l *Lexer) (fn stateFn) {\n\tl.acceptRunFn(unicode.IsSpace)\n\tl.lineCount()\n\t// l.incrementLineCount()\n\t// l.backup()\n\tl.ignore() //nothing will be emitted\n\n\tnext := l.peek()\n\tswitch {\n\tcase unicode.IsDigit(next):\n\t\treturn lexNumber\n\tcase unicode.IsPunct(next):\n\t\tif next == '-' {\n\t\t\tl.next()\n\t\t\tl.accept()\n\t\t\treturn lexNumber\n\t\t}\n\t\treturn lexPunctuation\n\tcase unicode.IsSymbol(next):\n\t\treturn lexSymbol\n\t}\n\n\treturn lexText\n}\n\nfunc lexPunctuation(l *Lexer) (fn stateFn) {\n\tnext := l.next()\n\tswitch next {\n\tcase '\\'':\n\t\tl.accept()\n\t\tn := l.peek()\n\t\tswitch n {\n\t\tcase 't', 's', 'm', 'd':\n\t\t\tl.next()\n\t\t\tl.accept() // accept 't'/'s'...\n\t\t\tl.emit(lingo.Word)\n\t\t\treturn lexWhitespace\n\t\t}\n\tcase '.':\n\t\tl.accept()\n\t\t// for cases such as \"U.S\" or \"i.e\"\n\t\tn := l.peek()\n\t\tif unicode.IsLetter(n) {\n\t\t\tl.accept() // accept 
.\n\t\t\tl.next()\n\t\t\tl.accept()\n\t\t\treturn lexText\n\t\t}\n\tdefault:\n\t}\n\n\taccepted := l.acceptRunFn(unicode.IsPunct) // check for any other runs of punctuations\n\tpunct := unicode.IsPunct(next)\n\tif accepted == 0 && punct {\n\t\tl.accept()\n\t}\n\tl.emit(lingo.Punctuation)\n\tif accepted == 0 && !punct && !unicode.IsSpace(next) {\n\t\treturn lexText\n\t}\n\treturn lexWhitespace\n}\n\nfunc lexSymbol(l *Lexer) (fn stateFn) {\n\tl.acceptRunFn(unicode.IsSymbol)\n\tl.acceptRunFn(unicode.IsPunct) // any symbol punctuation combination should be treated as a symbol\n\tl.emit(lingo.Symbol)\n\treturn lexWhitespace\n}\n\nfunc lexURI(l *Lexer) (fn stateFn) {\n\teof := l.nextUntilEOF(\" \")\n\tif !eof {\n\t\tl.backup()\n\t\tl.backup()\n\t\tnext := l.next()\n\t\tif unicode.IsPunct(next) {\n\t\t\tl.backup()\n\t\t\tl.emit(lingo.URI)\n\t\t\treturn lexPunctuation\n\t\t}\n\t}\n\n\tl.emit(lingo.URI)\n\treturn lexWhitespace\n}\n\nfunc lexDate(l *Lexer) (fn stateFn) {\n\tl.acceptRunFn(unicode.IsDigit)\n\tnext := l.next()\n\tif next != '/' && next != '-' {\n\t\tl.backup()\n\t\tl.emit(lingo.Number) // fractions are numbers\n\t\treturn lexWhitespace\n\t}\n\tl.r = '/' // standardize\n\tl.accept()\n\n\tl.acceptRunFn(unicode.IsDigit)\n\tl.emit(lingo.Date)\n\treturn lexWhitespace\n}\n\nfunc lexTime(l *Lexer) (fn stateFn) {\n\tl.acceptRunFn(unicode.IsDigit)\n\tnext := l.next()\n\tif next != ':' {\n\t\tl.backup()\n\t\tl.emit(lingo.Time)\n\t\treturn lexWhitespace\n\t}\n\tl.accept()\n\tl.acceptRunFn(unicode.IsDigit)\n\tl.emit(lingo.Time)\n\treturn lexWhitespace\n}\n"
  },
  {
    "path": "lingo.go",
    "content": "// package lingo provides the data structures and algorithms required for natural language processing.\npackage lingo\n"
  },
  {
    "path": "pos/allinone_test.go",
    "content": "package pos\n\nimport (\n\t\"log\"\n\t\"strings\"\n\t\"testing\"\n\n\t\"github.com/chewxy/lingo\"\n\t\"github.com/chewxy/lingo/lexer\"\n\t\"github.com/chewxy/lingo/treebank\"\n)\n\nfunc TestEverything(t *testing.T) {\n\tsentences := treebank.ReadConllu(strings.NewReader(conllu))\n\n\tsentence := \"President Bush comes on federal courts.\"\n\n\tp := New(WithCluster(clusters), WithLemmatizer(dummyLem{}), WithStemmer(dummyStemmer{}))\n\tp.Train(sentences, 200)\n\n\tl := lexer.New(sentence, strings.NewReader(sentence))\n\tp2 := p.Clone()\n\tp2.Input = l.Output\n\n\tvar correct string\n\tif lingo.BUILD_TAGSET == \"stanfordtags\" {\n\t\tcorrect = \"-ROOT-/ROOT_TAG President/NNP Bush/NNP comes/DT on/IN federal/JJ courts/NN ./FULLSTOP\"\n\t} else {\n\t\tcorrect = \"-ROOT-/ROOT_TAG President/PROPN Bush/PROPN comes/VERB on/ADP federal/ADJ courts/NOUN ./PUNCT\"\n\t}\n\n\tgo l.Run()\n\tgo p2.Run()\n\tfor a := range p2.Output {\n\n\t\t// this clearly isn't gonna be accurate, given the stubbed out Lemmatizer\n\t\tif a.String() != correct {\n\t\t\tt.Error(\"Something went wrong with the POSTagging\")\n\t\t\tlog.Printf(\"%v\", a)\n\t\t}\n\t}\n\n}\n"
  },
  {
    "path": "pos/context.go",
    "content": "package pos\n\nimport (\n\t\"strconv\"\n\n\t\"github.com/chewxy/lingo\"\n)\n\n/*\nA context is which word in the current state the POSTagger is in.\nThere are so far  5 contexts:\n\t- Previous previous word\n\t- previous word\n\t- current word\n\t- next word\n\t- next next word\n\nFor each context we have 8 features:\n\t- word (lower case)\n\t- lemma\n\t- cluster\n\t- shape\n\t- prefix (first 1)\n\t- suffix (last 3)\n\t- POSTag\n\t- wordflag\n*/\n\n//go:generate stringer -type=contextType\ntype contextType byte\n\nconst featuresPerContext = 8\nconst contexts = 5\nconst (\n\t// previous previous (prev2)\n\tprev2Word contextType = iota\n\tprev2Lemma\n\tprev2Cluster\n\tprev2Shape\n\tprev2Prefix1\n\tprev2Suffix3\n\tprev2POSTag\n\tprev2Flags\n\n\t// previous\n\tprevWord\n\tprevLemma\n\tprevCluster\n\tprevShape\n\tprevPrefix1\n\tprevSuffix3\n\tprevPOSTag\n\tprevFlags\n\n\t// ith token\n\tithWord\n\tithLemma\n\tithCluster\n\tithShape\n\tithPrefix1\n\tithSuffix3\n\tithPOSTag\n\tithFlags\n\n\t// next token\n\tnextWord\n\tnextLemma\n\tnextCluster\n\tnextShape\n\tnextPrefix1\n\tnextSuffix3\n\tnextPOSTag\n\tnextFlags\n\n\t// next next token\n\tnext2Word\n\tnext2Lemma\n\tnext2Cluster\n\tnext2Shape\n\tnext2Prefix1\n\tnext2Suffix3\n\tnext2POSTag\n\tnext2Flags\n\n\tMAXCONTEXTTYPE\n)\n\ntype contextMap [MAXCONTEXTTYPE]string\n\nfunc getContext(prev2, prev, ith, next, next2 *lingo.Annotation) (retVal contextMap) {\n\tvar listOfFeats = [contexts][featuresPerContext]string{\n\t\textractContext(prev2),\n\t\textractContext(prev),\n\t\textractContext(ith),\n\t\textractContext(next),\n\t\textractContext(next2),\n\t}\n\n\tfor i, l := range listOfFeats {\n\t\tfor j, s := range l {\n\t\t\tretVal[i*featuresPerContext+j] = s\n\t\t}\n\t}\n\n\treturn retVal\n}\n\n// type featureContext struct {\n// \tword    string\n// \tlemma   string\n// \tcluster lingo.Cluster\n// \tshape   string\n// \tprefix  string\n// \tsuffix  string\n// \tPOSTag  lingo.POSTag\n// \tflag    
lingo.WordFlag\n// }\n\n// extractContext extracts the feature contexts from a given annotation\nfunc extractContext(a *lingo.Annotation) (retVal [featuresPerContext]string) {\n\tif a == nil {\n\t\treturn retVal\n\t}\n\n\tword := a.Lowered\n\n\t// we normalize all the unicode bytes first\n\tasRunes := []rune(a.Value)\n\tloweredRunes := []rune(word)\n\n\tretVal[0] = word\n\tretVal[1] = a.Lemma\n\tretVal[2] = strconv.Itoa(int(a.Cluster))\n\tretVal[3] = string(a.Shape)\n\n\t// prefix and suffix\n\t// we want the characters, not the bytes\n\t// for the prefix, we'll use the un-normalized version because having that extra fidelity would be useful\n\tif len(asRunes) > 0 {\n\t\tretVal[4] = string(asRunes[0])\n\t} else {\n\t\tretVal[4] = \"\"\n\t}\n\tif len(loweredRunes) >= 3 {\n\t\tretVal[5] = string(loweredRunes[len(loweredRunes)-3 : len(loweredRunes)])\n\t} else {\n\t\tretVal[5] = \"\"\n\t}\n\tretVal[6] = a.POSTag.String()\n\tretVal[7] = a.WordFlag.String()\n\n\treturn retVal\n}\n"
  },
  {
    "path": "pos/context_test.go",
    "content": "package pos\n\nimport (\n\t\"strings\"\n\t\"testing\"\n\n\t\"github.com/chewxy/lingo\"\n)\n\nvar extractContextTest = []struct {\n\tval string\n\ttag lingo.POSTag\n\n\tshape string\n\tpref  string\n\tsuff  string\n\tflag  string\n\tclust string\n}{\n\t{\"TEst\", lingo.ROOT_TAG, \"XXxx\", \"T\", \"est\", \"00000000000110\", \"1\"},\n\t{\"TEst\", lingo.X, \"XXxx\", \"T\", \"est\", \"00000000000110\", \"1\"},\n\t{\"NotInClust\", lingo.UNKNOWN_TAG, \"XxxXxXxxxx\", \"N\", \"ust\", \"00000000000110\", \"0\"},\n\t{\"\", lingo.X, \"\", \"\", \"\", \"00000101111110\", \"0\"},\n}\n\nfunc TestExtractContext(t *testing.T) {\n\n\tfor i, ects := range extractContextTest {\n\t\ta := lingo.StringToAnnotation(ects.val, dummyFix{})\n\t\ta.POSTag = ects.tag\n\n\t\tres := extractContext(a)\n\n\t\tif res[0] != strings.ToLower(ects.val) {\n\t\t\tt.Errorf(\"Test %d: Expected word feature to be %q. Got %q instead\", i, strings.ToLower(ects.val), res[0])\n\t\t}\n\n\t\tif res[2] != ects.clust {\n\t\t\tt.Errorf(\"Test %d: Expected cluster to be %q. Got %q instead\", i, ects.clust, res[2])\n\t\t}\n\n\t\tif res[3] != ects.shape {\n\t\t\tt.Errorf(\"Test %d: Expected shape to be %q. Got %q instead\", i, ects.shape, res[3])\n\t\t}\n\n\t\tif res[4] != ects.pref {\n\t\t\tt.Errorf(\"Test %d: Expected prefix to be %q. Got %q instead\", i, ects.pref, res[4])\n\t\t}\n\n\t\tif res[5] != ects.suff {\n\t\t\tt.Errorf(\"Test %d: Expected suffix to be %q. Got %q instead\", i, ects.suff, res[5])\n\t\t}\n\n\t\tif res[6] != ects.tag.String() {\n\t\t\tt.Errorf(\"Test %d: Expected postag to be %q. Got %q instead\", i, ects.tag, res[6])\n\t\t}\n\n\t\tif res[7] != ects.flag {\n\t\t\tt.Errorf(\"Test %d: Expected flag to be %q. Got %q instead\", i, ects.flag, res[7])\n\t\t}\n\t}\n\n}\n"
  },
  {
    "path": "pos/contexttype_string.go",
    "content": "// generated by stringer -type=contextType; DO NOT EDIT\n\npackage pos\n\nimport \"fmt\"\n\nconst _contextType_name = \"prev2Wordprev2Lemmaprev2Clusterprev2Shapeprev2Prefix1prev2Suffix3prev2POSTagprev2FlagsprevWordprevLemmaprevClusterprevShapeprevPrefix1prevSuffix3prevPOSTagprevFlagsithWordithLemmaithClusterithShapeithPrefix1ithSuffix3ithPOSTagithFlagsnextWordnextLemmanextClusternextShapenextPrefix1nextSuffix3nextPOSTagnextFlagsnext2Wordnext2Lemmanext2Clusternext2Shapenext2Prefix1next2Suffix3next2POSTagnext2FlagsMAXCONTEXTTYPE\"\n\nvar _contextType_index = [...]uint16{0, 9, 19, 31, 41, 53, 65, 76, 86, 94, 103, 114, 123, 134, 145, 155, 164, 171, 179, 189, 197, 207, 217, 226, 234, 242, 251, 262, 271, 282, 293, 303, 312, 321, 331, 343, 353, 365, 377, 388, 398, 412}\n\nfunc (i contextType) String() string {\n\tif i >= contextType(len(_contextType_index)-1) {\n\t\treturn fmt.Sprintf(\"contextType(%d)\", i)\n\t}\n\treturn _contextType_name[_contextType_index[i]:_contextType_index[i+1]]\n}\n"
  },
  {
    "path": "pos/debug.go",
    "content": "// +build debug\n\npackage pos\n\nimport (\n\t\"log\"\n\t\"strings\"\n\t\"sync/atomic\"\n)\n\nconst BUILD_DEBUG = \"POS TAGGER: Debug Build\"\n\nvar TABCOUNT uint32 = 0\n\nvar tracking = false\n\nfunc tabcount() int {\n\treturn int(atomic.LoadUint32(&TABCOUNT))\n}\n\nfunc enterLoggingContext() {\n\tatomic.AddUint32(&TABCOUNT, 1)\n\ttc := tabcount()\n\tlog.SetPrefix(strings.Repeat(\"\\t\", tc))\n}\n\nfunc leaveLoggingContext() {\n\ttc := tabcount()\n\ttc--\n\n\tif tc < 0 {\n\t\tatomic.StoreUint32(&TABCOUNT, 0)\n\t\ttc = 0\n\t} else {\n\t\tatomic.StoreUint32(&TABCOUNT, uint32(tc))\n\t}\n\tlog.SetPrefix(strings.Repeat(\"\\t\", tc))\n}\n\nfunc logf(format string, others ...interface{}) {\n\tlog.Printf(format, others...)\n}\n\nfunc recoverFrom(format string, attrs ...interface{}) {\n\tif r := recover(); r != nil {\n\t\tlog.Printf(format, attrs...)\n\t\tpanic(r)\n\t}\n}\n"
  },
  {
    "path": "pos/errors.go",
    "content": "package pos\n\nimport \"fmt\"\n\ntype componentUnavailable string\n\nfunc (c componentUnavailable) Error() string     { return fmt.Sprintf(\"%v unavailable\", c) }\nfunc (c componentUnavailable) Component() string { return string(c) }\n"
  },
  {
    "path": "pos/features.go",
    "content": "package pos\n\nimport (\n\t\"bytes\"\n\t\"fmt\"\n\n\t\"github.com/chewxy/lingo\"\n)\n\ntype featureType byte\n\n//go:generate stringer -type=featureType\nconst (\n\tbias featureType = iota\n\n\tithWord_\n\tnextWord_\n\tnext2Word_\n\n\tithSuffix3_\n\tithPrefix1_\n\n\tprevPOSTag_\n\tprev2POSTag_\n\tprevSuffix3_\n\tnextSuffix3_\n\n\tithShape_\n\tithCluster_\n\tnextCluster_\n\tnext2Cluster_\n\tprevCluster_\n\tprev2Cluster_\n\n\tithFlags_\n\tnextFlags_\n\tnext2Flags_\n\tprevFlags_\n\tprev2Flags_\n\n\tprevLemma_prevPOSTag\n\tprevPOSTag_ithWord\n\tprevPOSTag_prev2POSTag\n\tprev2Lemma_prev2POSTag\n\n\tMAXFEATURETYPE\n)\n\nvar featCtxMap = map[featureType]contextType{\n\tithWord_:   ithWord,\n\tnextWord_:  nextWord,\n\tnext2Word_: next2Word,\n\n\tithSuffix3_: ithSuffix3,\n\tithPrefix1_: ithPrefix1,\n\n\tprevPOSTag_:  prevPOSTag,\n\tprev2POSTag_: prev2POSTag,\n\tprevSuffix3_: prevSuffix3,\n\tnextSuffix3_: nextSuffix3,\n\n\tithShape_:     ithShape,\n\tithCluster_:   ithCluster,\n\tnextCluster_:  nextCluster,\n\tnext2Cluster_: next2Cluster,\n\tprevCluster_:  prevCluster,\n\tprev2Cluster_: prev2Cluster,\n\n\tithFlags_:   ithFlags,\n\tnextFlags_:  nextFlags,\n\tnext2Flags_: next2Flags,\n\tprevFlags_:  prevFlags,\n\tprev2Flags_: prev2Flags,\n}\n\ntype feature interface {\n\tFeatType() featureType\n\tString() string\n}\n\ntype singleFeature struct {\n\tfeatureType\n\tvalue string\n}\n\nfunc (sf singleFeature) FeatType() featureType { return sf.featureType }\nfunc (sf singleFeature) String() string {\n\treturn fmt.Sprintf(\"singleFeature{%v, %q}\", sf.featureType, sf.value)\n}\n\ntype tupleFeature struct {\n\tfeatureType\n\tvalue1 string\n\tvalue2 string\n}\n\nfunc (tf tupleFeature) FeatType() featureType { return tf.featureType }\nfunc (tf tupleFeature) String() string {\n\treturn fmt.Sprintf(\"tupleFeature {%v, %q, %q}\", tf.featureType, tf.value1, tf.value2)\n}\n\ntype featureMap map[feature]float64\n\nfunc (fm featureMap) String() string {\n\tvar buf 
bytes.Buffer\n\tfor f := range fm {\n\t\tfmt.Fprintf(&buf, \"%s: 1,\\n\", f)\n\t}\n\treturn buf.String()\n}\n\nfunc (fm *featureMap) add(f feature) { (*fm)[f]++ }\n\ntype sfFeatures [prevLemma_prevPOSTag]singleFeature\ntype tfFeatures [MAXFEATURETYPE - prevLemma_prevPOSTag]tupleFeature\n\nfunc fillFromContext(c contextMap) (sf sfFeatures, tf tfFeatures) {\n\tfor i := bias; i < prevLemma_prevPOSTag; i++ {\n\t\tsf[i] = singleFeature{i, c[featCtxMap[i]]}\n\t}\n\n\tconst last = prevLemma_prevPOSTag\n\ttf[prevLemma_prevPOSTag-last] = tupleFeature{prevLemma_prevPOSTag, c[prevLemma], c[prevPOSTag]}\n\ttf[prevPOSTag_ithWord-last] = tupleFeature{prevPOSTag_ithWord, c[prevPOSTag], c[ithWord]}\n\ttf[prevPOSTag_prev2POSTag-last] = tupleFeature{prevPOSTag_prev2POSTag, c[prevPOSTag], c[prev2POSTag]}\n\ttf[prev2Lemma_prev2POSTag-last] = tupleFeature{prev2Lemma_prev2POSTag, c[prev2Lemma], c[prev2POSTag]}\n\treturn\n}\n\nfunc getFeatures(s lingo.AnnotatedSentence, i int) (sfFeatures, tfFeatures) {\n\tlength := len(s)\n\n\t// set up context defaults\n\tprev2 := lingo.NullAnnotation()\n\tprev := lingo.NullAnnotation()\n\tith := s[i]\n\tnext := lingo.NullAnnotation()\n\tnext2 := lingo.NullAnnotation()\n\n\tif i-1 >= 0 {\n\t\tprev = s[i-1]\n\t}\n\tif i-2 >= 0 {\n\t\tprev2 = s[i-2]\n\t}\n\tif i+1 < length {\n\t\tnext = s[i+1]\n\t}\n\tif i+2 < length {\n\t\tnext2 = s[i+2]\n\t}\n\n\tc := getContext(prev2, prev, ith, next, next2)\n\n\treturn fillFromContext(c)\n}\n"
  },
  {
    "path": "pos/features_test.go",
    "content": "// +build stanfordtags\n\npackage pos\n\nimport (\n\t\"testing\"\n\n\t\"github.com/chewxy/lingo\"\n\t\"github.com/stretchr/testify/assert\"\n)\n\nfunc TestGetFeatures(t *testing.T) {\n\tassert := assert.New(t)\n\n\t// test two word sentence\n\ts2 := lingo.AnnotatedSentence{\n\t\tlingo.AnnotationFromLexTag(lingo.Lexeme{\"most\", lingo.Word, -1, -1}, lingo.RBS, dummyFix{}),\n\t\tlingo.AnnotationFromLexTag(lingo.Lexeme{\"populous\", lingo.Word, -1, -1}, lingo.X, dummyFix{}),\n\t}\n\n\tfeatMap := getFeatures(s2, 0)\n\texpectedFM := featureMap{\n\t\tsingleFeature{bias, \"\"}:                       1,\n\t\tsingleFeature{ithWord_, \"most\"}:               1,\n\t\ttupleFeature{prevLemma_prevPOSTag, \"\", \"X\"}:   1,\n\t\ttupleFeature{prev2Lemma_prev2POSTag, \"\", \"X\"}: 1,\n\t\tsingleFeature{nextWord_, \"populous\"}:          1,\n\t\tsingleFeature{next2Word_, \"\"}:                 1,\n\n\t\tsingleFeature{ithSuffix3_, \"ost\"}: 1,\n\t\tsingleFeature{ithPrefix1_, \"m\"}:   1,\n\n\t\tsingleFeature{prevPOSTag_, \"X\"}:                 1,\n\t\tsingleFeature{prev2POSTag_, \"X\"}:                1,\n\t\ttupleFeature{prevPOSTag_prev2_POSTag, \"X\", \"X\"}: 1,\n\t\ttupleFeature{prevPOSTag_ithWord, \"X\", \"most\"}:   1,\n\t\tsingleFeature{prevSuffix3_, \"\"}:                 1,\n\t\tsingleFeature{nextSuffix3_, \"ous\"}:              1,\n\n\t\tsingleFeature{ithShape_, \"xxxx\"}:  1,\n\t\tsingleFeature{ithCluster_, \"0\"}:   1,\n\t\tsingleFeature{nextCluster_, \"0\"}:  1,\n\t\tsingleFeature{next2Cluster_, \"0\"}: 1,\n\t\tsingleFeature{prevCluster_, \"0\"}:  1,\n\t\tsingleFeature{prev2Cluster_, \"0\"}: 1,\n\n\t\tsingleFeature{ithFlags_, \"01000000010110\"}:   1,\n\t\tsingleFeature{nextFlags_, \"00000000010110\"}:  1,\n\t\tsingleFeature{next2Flags_, \"00000000000000\"}: 1,\n\t\tsingleFeature{prevFlags_, \"00000000000000\"}:  1,\n\t\tsingleFeature{prev2Flags_, \"00000000000000\"}: 1,\n\t}\n\tassert.EqualValues(expectedFM, featMap, \"Want: \\n%v\\n\\nGot: \\n%v\", 
expectedFM, featMap)\n\n\t// test five word sentence\n\ts5 := lingo.AnnotatedSentence{\n\t\tlingo.AnnotationFromLexTag(lingo.Lexeme{\"most\", lingo.Word, -1, -1}, lingo.RBS, dummyFix{}),\n\t\tlingo.AnnotationFromLexTag(lingo.Lexeme{\"populous\", lingo.Word, -1, -1}, lingo.X, dummyFix{}),\n\t\tlingo.AnnotationFromLexTag(lingo.Lexeme{\"state\", lingo.Word, -1, -1}, lingo.X, dummyFix{}),\n\t\tlingo.AnnotationFromLexTag(lingo.Lexeme{\"in\", lingo.Word, -1, -1}, lingo.X, dummyFix{}),\n\t\tlingo.AnnotationFromLexTag(lingo.Lexeme{\"America\", lingo.Word, -1, -1}, lingo.X, dummyFix{}),\n\t}\n\n\tfeatMap = getFeatures(s5, 0) // no prev\n\n\texpectedFM = featureMap{\n\t\tsingleFeature{bias, \"\"}:                       1,\n\t\tsingleFeature{ithWord_, \"most\"}:               1,\n\t\ttupleFeature{prevLemma_prevPOSTag, \"\", \"X\"}:   1,\n\t\ttupleFeature{prev2Lemma_prev2POSTag, \"\", \"X\"}: 1,\n\t\tsingleFeature{nextWord_, \"populous\"}:          1,\n\t\tsingleFeature{next2Word_, \"state\"}:            1,\n\n\t\tsingleFeature{ithSuffix3_, \"ost\"}: 1,\n\t\tsingleFeature{ithPrefix1_, \"m\"}:   1,\n\n\t\tsingleFeature{prevPOSTag_, \"X\"}:                 1,\n\t\tsingleFeature{prev2POSTag_, \"X\"}:                1,\n\t\ttupleFeature{prevPOSTag_prev2_POSTag, \"X\", \"X\"}: 1,\n\t\ttupleFeature{prevPOSTag_ithWord, \"X\", \"most\"}:   1,\n\t\tsingleFeature{prevSuffix3_, \"\"}:                 1,\n\t\tsingleFeature{nextSuffix3_, \"ous\"}:              1,\n\n\t\tsingleFeature{ithShape_, \"xxxx\"}:  1,\n\t\tsingleFeature{ithCluster_, \"0\"}:   1,\n\t\tsingleFeature{nextCluster_, \"0\"}:  1,\n\t\tsingleFeature{next2Cluster_, \"0\"}: 1,\n\t\tsingleFeature{prevCluster_, \"0\"}:  1,\n\t\tsingleFeature{prev2Cluster_, \"0\"}: 1,\n\n\t\tsingleFeature{ithFlags_, \"01000000010110\"}:   1,\n\t\tsingleFeature{nextFlags_, \"00000000010110\"}:  1,\n\t\tsingleFeature{next2Flags_, \"00000000010110\"}: 1,\n\t\tsingleFeature{prevFlags_, \"00000000000000\"}:  1,\n\t\tsingleFeature{prev2Flags_, 
\"00000000000000\"}: 1,\n\t}\n\tassert.EqualValues(expectedFM, featMap, \"Want: \\n%v\\n\\nGot: \\n%v\", expectedFM, featMap)\n\n\tfeatMap = getFeatures(s5, 2) // has all the feats\n\texpectedFM = featureMap{\n\t\tsingleFeature{bias, \"\"}:                         1,\n\t\tsingleFeature{ithWord_, \"state\"}:                1,\n\t\ttupleFeature{prev2Lemma_prev2POSTag, \"\", \"RBS\"}: 1,\n\t\ttupleFeature{prevLemma_prevPOSTag, \"\", \"X\"}:     1,\n\t\tsingleFeature{nextWord_, \"in\"}:                  1,\n\t\tsingleFeature{next2Word_, \"america\"}:            1,\n\n\t\tsingleFeature{ithSuffix3_, \"ate\"}: 1,\n\t\tsingleFeature{ithPrefix1_, \"s\"}:   1,\n\n\t\tsingleFeature{prevPOSTag_, \"X\"}:                   1,\n\t\tsingleFeature{prev2POSTag_, \"RBS\"}:                1,\n\t\ttupleFeature{prevPOSTag_prev2_POSTag, \"X\", \"RBS\"}: 1,\n\t\ttupleFeature{prevPOSTag_ithWord, \"X\", \"state\"}:    1,\n\t\tsingleFeature{prevSuffix3_, \"ous\"}:                1,\n\t\tsingleFeature{nextSuffix3_, \"\"}:                   1,\n\n\t\tsingleFeature{ithShape_, \"xxxx\"}:  1,\n\t\tsingleFeature{ithCluster_, \"0\"}:   1,\n\t\tsingleFeature{nextCluster_, \"0\"}:  1,\n\t\tsingleFeature{next2Cluster_, \"0\"}: 1,\n\t\tsingleFeature{prevCluster_, \"0\"}:  1,\n\t\tsingleFeature{prev2Cluster_, \"0\"}: 1,\n\n\t\tsingleFeature{ithFlags_, \"00000000010110\"}:   1,\n\t\tsingleFeature{nextFlags_, \"01000000010110\"}:  1,\n\t\tsingleFeature{next2Flags_, \"00000010000110\"}: 1,\n\t\tsingleFeature{prevFlags_, \"00000000010110\"}:  1,\n\t\tsingleFeature{prev2Flags_, \"01000000010110\"}: 1,\n\t}\n\tassert.EqualValues(expectedFM, featMap, \"Want: \\n%v\\n\\nGot: \\n%v\", expectedFM, featMap)\n\n\tfeatMap = getFeatures(s5, 4) // no nexts\n\n\texpectedFM = featureMap{\n\t\tsingleFeature{bias, \"\"}:                       1,\n\t\tsingleFeature{ithWord_, \"america\"}:            1,\n\t\ttupleFeature{prev2Lemma_prev2POSTag, \"\", \"X\"}: 1,\n\t\ttupleFeature{prevLemma_prevPOSTag, \"\", \"X\"}:   
1,\n\t\tsingleFeature{nextWord_, \"\"}:                  1,\n\t\tsingleFeature{next2Word_, \"\"}:                 1,\n\n\t\tsingleFeature{ithSuffix3_, \"ica\"}: 1,\n\t\tsingleFeature{ithPrefix1_, \"A\"}:   1,\n\n\t\tsingleFeature{prevPOSTag_, \"X\"}:                  1,\n\t\tsingleFeature{prev2POSTag_, \"X\"}:                 1,\n\t\ttupleFeature{prevPOSTag_prev2_POSTag, \"X\", \"X\"}:  1,\n\t\ttupleFeature{prevPOSTag_ithWord, \"X\", \"america\"}: 1,\n\t\tsingleFeature{prevSuffix3_, \"\"}:                  1,\n\t\tsingleFeature{nextSuffix3_, \"\"}:                  1,\n\n\t\tsingleFeature{ithShape_, \"Xxxxx\"}: 1,\n\t\tsingleFeature{ithCluster_, \"0\"}:   1,\n\t\tsingleFeature{nextCluster_, \"0\"}:  1,\n\t\tsingleFeature{next2Cluster_, \"0\"}: 1,\n\t\tsingleFeature{prevCluster_, \"0\"}:  1,\n\t\tsingleFeature{prev2Cluster_, \"0\"}: 1,\n\n\t\tsingleFeature{ithFlags_, \"00000010000110\"}:   1,\n\t\tsingleFeature{nextFlags_, \"00000000000000\"}:  1,\n\t\tsingleFeature{next2Flags_, \"00000000000000\"}: 1,\n\t\tsingleFeature{prevFlags_, \"01000000010110\"}:  1,\n\t\tsingleFeature{prev2Flags_, \"00000000010110\"}: 1,\n\t}\n\n\tassert.EqualValues(expectedFM, featMap, \"Want: \\n%v\\n\\nGot: \\n%v\", expectedFM, featMap)\n}\n"
  },
  {
    "path": "pos/featuretype_string.go",
    "content": "// generated by stringer -type=featureType; DO NOT EDIT\n\npackage pos\n\nimport \"fmt\"\n\nconst _featureType_name = \"biasithWord_prevLemma_prevPOSTagprev2Lemma_prev2POSTagnextWord_next2Word_ithSuffix3_ithPrefix1_prevPOSTag_prev2POSTag_prevPOSTag_prev2_POSTagprevPOSTag_ithWordprevSuffix3_nextSuffix3_ithShape_ithCluster_nextCluster_next2Cluster_prevCluster_prev2Cluster_ithFlags_nextFlags_next2Flags_prevFlags_prev2Flags_MAXFEATURETYPE\"\n\nvar _featureType_index = [...]uint16{0, 4, 12, 32, 54, 63, 73, 84, 95, 106, 118, 141, 159, 171, 183, 192, 203, 215, 228, 240, 253, 262, 272, 283, 293, 304, 318}\n\nfunc (i featureType) String() string {\n\tif i >= featureType(len(_featureType_index)-1) {\n\t\treturn fmt.Sprintf(\"featureType(%d)\", i)\n\t}\n\treturn _featureType_name[_featureType_index[i]:_featureType_index[i+1]]\n}\n"
  },
  {
    "path": "pos/models.go",
    "content": "package pos\n\nimport (\n\t\"bufio\"\n\t\"encoding/gob\"\n\t\"io\"\n\t\"os\"\n\n\t\"github.com/chewxy/lingo\"\n)\n\n// Model is the model that the POS Tagger runs on.\ntype Model struct {\n\t*perceptron\n\tcachedTags map[string]lingo.POSTag\n}\n\n// Save saves the model\nfunc (m *Model) Save(filename string) error {\n\tf, err := os.Create(filename)\n\tif err != nil {\n\t\treturn err\n\t}\n\treturn m.SaveWriter(f)\n}\n\nfunc (m *Model) SaveWriter(f io.WriteCloser) error {\n\tdefer f.Close()\n\n\tw := bufio.NewWriter(f)\n\tdefer w.Flush()\n\n\tencoder := gob.NewEncoder(w)\n\n\tif err := encoder.Encode(m.perceptron); err != nil {\n\t\treturn err\n\t}\n\n\tif err := encoder.Encode(m.cachedTags); err != nil {\n\t\treturn err\n\t}\n\n\treturn nil\n\n}\n\nfunc Load(filename string) (*Model, error) {\n\tf, err := os.Open(filename)\n\tif err != nil {\n\t\treturn nil, err\n\t}\n\treturn LoadReader(f)\n}\n\nfunc LoadReader(rd io.ReadCloser) (*Model, error) {\n\tdefer rd.Close()\n\n\tr := bufio.NewReader(rd)\n\tdecoder := gob.NewDecoder(r)\n\n\tm := &Model{\n\t\tperceptron: newPerceptron(),\n\t}\n\tif err := decoder.Decode(m.perceptron); err != nil {\n\t\treturn nil, err\n\t}\n\n\tif err := decoder.Decode(&m.cachedTags); err != nil {\n\t\treturn nil, err\n\t}\n\n\treturn m, nil\n\n}\n\nfunc (p *Tagger) Load(filename string) error {\n\tm, err := Load(filename)\n\tif err != nil {\n\t\treturn err\n\t}\n\tp.Model = m\n\treturn nil\n}\n"
  },
  {
    "path": "pos/models_test.go",
    "content": "package pos\n\nimport (\n\t\"os\"\n\t\"strings\"\n\t\"testing\"\n\n\t\"github.com/chewxy/lingo/treebank\"\n\t\"github.com/stretchr/testify/assert\"\n)\n\nfunc TestSaveLoad(t *testing.T) {\n\tpt := New()\n\tsentences := treebank.ReadConllu(strings.NewReader(conllu))\n\n\tpt.Train(sentences, 5)\n\tpt.Save(\"test.dat\")\n\n\tpt2 := New()\n\tif err := pt2.Load(\"test.dat\"); err != nil {\n\t\tos.Remove(\"test.dat\")\n\t\tt.Fatal(err)\n\t}\n\n\tassert := assert.New(t)\n\n\tassert.Equal(pt.perceptron, pt2.perceptron, \"POSTaggers' perceptrons are different:%p %p\", pt.perceptron, pt2.perceptron)\n\tassert.Equal(pt.cachedTags, pt2.cachedTags, \"POSTaggers' cachedTags are different\")\n\n\t// cleanup\n\tos.Remove(\"test.dat\")\n}\n"
  },
  {
    "path": "pos/perceptron.go",
    "content": "package pos\n\nimport \"github.com/chewxy/lingo\"\n\ntype perceptron struct {\n\t// weights map[feature]*[lingo.MAXTAG]float64 // it's a pointer to a static array because map values are immutable, and cannot be edited\n\n\tweightsSF map[singleFeature]*[lingo.MAXTAG]float64\n\tweightsTF map[tupleFeature]*[lingo.MAXTAG]float64\n\n\ttotals map[fctuple]float64\n\tsteps  map[fctuple]float64\n\n\tinstancesSeen float64\n}\n\n// feature-class tuple is a tuple that contains a feature and a class. This makes calculation of the averaging easier\ntype fctuple struct {\n\tfeature\n\tlingo.POSTag\n}\n\nfunc newPerceptron() *perceptron {\n\treturn &perceptron{\n\t\t// weights: make(map[feature]*[lingo.MAXTAG]float64),\n\n\t\tweightsSF: make(map[singleFeature]*[lingo.MAXTAG]float64),\n\t\tweightsTF: make(map[tupleFeature]*[lingo.MAXTAG]float64),\n\n\t\ttotals: make(map[fctuple]float64),\n\t\tsteps:  make(map[fctuple]float64),\n\t}\n}\n\nfunc (p *perceptron) updateWeightsSF(f singleFeature, tag lingo.POSTag, weight, value float64) {\n\ttuple := fctuple{f, tag}\n\tp.totals[tuple] += (p.instancesSeen - p.steps[tuple]) * weight\n\tp.steps[tuple] = p.instancesSeen\n\n\tif _, ok := p.weightsSF[f]; !ok {\n\t\tp.weightsSF[f] = new([lingo.MAXTAG]float64)\n\t}\n\tp.weightsSF[f][tag] = weight + value\n}\n\nfunc (p *perceptron) updateWeightsTF(f tupleFeature, tag lingo.POSTag, weight, value float64) {\n\ttuple := fctuple{f, tag}\n\tp.totals[tuple] += (p.instancesSeen - p.steps[tuple]) * weight\n\tp.steps[tuple] = p.instancesSeen\n\n\tif _, ok := p.weightsTF[f]; !ok {\n\t\tp.weightsTF[f] = new([lingo.MAXTAG]float64)\n\t}\n\tp.weightsTF[f][tag] = weight + value\n}\n\nfunc (p *perceptron) update(guess, truth lingo.POSTag, sf sfFeatures, tf tfFeatures) {\n\tp.instancesSeen++\n\tif truth == guess {\n\t\treturn\n\t}\n\n\tfor _, f := range sf {\n\t\tvar truthValue float64\n\t\tvar guessValue float64\n\n\t\tif weights, ok := p.weightsSF[f]; ok {\n\t\t\ttruthValue = 
weights[truth]\n\t\t\tguessValue = weights[guess]\n\t\t}\n\n\t\tp.updateWeightsSF(f, truth, truthValue, 1)\n\t\tp.updateWeightsSF(f, guess, guessValue, -1)\n\t}\n\n\tfor _, f := range tf {\n\t\tvar truthValue float64\n\t\tvar guessValue float64\n\n\t\tif weights, ok := p.weightsTF[f]; ok {\n\t\t\ttruthValue = weights[truth]\n\t\t\tguessValue = weights[guess]\n\t\t}\n\n\t\tp.updateWeightsTF(f, truth, truthValue, 1)\n\t\tp.updateWeightsTF(f, guess, guessValue, -1)\n\t}\n}\n\nfunc (p *perceptron) predict(sf sfFeatures, tf tfFeatures) lingo.POSTag {\n\tvar scores [lingo.MAXTAG]float64\n\tfor _, f := range sf {\n\t\tif weights, ok := p.weightsSF[f]; ok {\n\t\t\tfor label, weight := range weights {\n\t\t\t\tscores[label] += weight\n\t\t\t}\n\t\t}\n\t}\n\n\tfor _, f := range tf {\n\t\tif weights, ok := p.weightsTF[f]; ok {\n\t\t\tfor label, weight := range weights {\n\t\t\t\tscores[label] += weight\n\t\t\t}\n\t\t}\n\t}\n\n\treturn maxScore(&scores)\n}\n\nfunc (p *perceptron) average() {\n\tfor f, weights := range p.weightsSF {\n\t\tfor c, weight := range weights {\n\t\t\ttuple := fctuple{f, lingo.POSTag(c)}\n\t\t\ttotal := p.totals[tuple]\n\n\t\t\ttotal += (p.instancesSeen - p.steps[tuple]) * weight\n\t\t\tavg := total / p.instancesSeen\n\n\t\t\tweights[c] = avg\n\t\t}\n\t}\n\n\tfor f, weights := range p.weightsTF {\n\t\tfor c, weight := range weights {\n\t\t\ttuple := fctuple{f, lingo.POSTag(c)}\n\t\t\ttotal := p.totals[tuple]\n\n\t\t\ttotal += (p.instancesSeen - p.steps[tuple]) * weight\n\t\t\tavg := total / p.instancesSeen\n\n\t\t\tweights[c] = avg\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "pos/perceptron_io.go",
    "content": "package pos\n\nimport (\n\t\"bytes\"\n\t\"encoding/gob\"\n)\n\n/* Feature Gob interface */\n\nfunc (sf singleFeature) GobEncode() ([]byte, error) {\n\tvar buf bytes.Buffer\n\tencoder := gob.NewEncoder(&buf)\n\n\tif err := encoder.Encode(sf.featureType); err != nil {\n\t\treturn nil, err\n\t}\n\n\tif err := encoder.Encode(sf.value); err != nil {\n\t\treturn nil, err\n\t}\n\n\treturn buf.Bytes(), nil\n}\n\nfunc (sf *singleFeature) GobDecode(buf []byte) error {\n\tb := bytes.NewBuffer(buf)\n\n\tdecoder := gob.NewDecoder(b)\n\n\tif err := decoder.Decode(&sf.featureType); err != nil {\n\t\treturn err\n\t}\n\n\tif err := decoder.Decode(&sf.value); err != nil {\n\t\treturn err\n\t}\n\n\treturn nil\n}\n\nfunc (tf tupleFeature) GobEncode() ([]byte, error) {\n\tvar buf bytes.Buffer\n\tencoder := gob.NewEncoder(&buf)\n\n\tif err := encoder.Encode(tf.featureType); err != nil {\n\t\treturn nil, err\n\t}\n\n\tif err := encoder.Encode(tf.value1); err != nil {\n\t\treturn nil, err\n\t}\n\n\tif err := encoder.Encode(tf.value2); err != nil {\n\t\treturn nil, err\n\t}\n\n\treturn buf.Bytes(), nil\n}\n\nfunc (tf *tupleFeature) GobDecode(buf []byte) error {\n\tb := bytes.NewBuffer(buf)\n\n\tdecoder := gob.NewDecoder(b)\n\n\tif err := decoder.Decode(&tf.featureType); err != nil {\n\t\treturn err\n\t}\n\n\tif err := decoder.Decode(&tf.value1); err != nil {\n\t\treturn err\n\t}\n\n\tif err := decoder.Decode(&tf.value2); err != nil {\n\t\treturn err\n\t}\n\n\treturn nil\n}\n\n/* fctuple Gob Interface */\nfunc (fc fctuple) GobEncode() ([]byte, error) {\n\tvar buf bytes.Buffer\n\tencoder := gob.NewEncoder(&buf)\n\n\tif err := encoder.Encode(&fc.feature); err != nil {\n\t\treturn nil, err\n\t}\n\n\tif err := encoder.Encode(fc.POSTag); err != nil {\n\t\treturn nil, err\n\t}\n\n\treturn buf.Bytes(), nil\n}\n\nfunc (fc *fctuple) GobDecode(buf []byte) error {\n\tb := bytes.NewBuffer(buf)\n\n\tdecoder := gob.NewDecoder(b)\n\tif err := decoder.Decode(&fc.feature); err != nil 
{\n\t\treturn err\n\t}\n\n\tif err := decoder.Decode(&fc.POSTag); err != nil {\n\t\treturn err\n\t}\n\treturn nil\n}\n\n/* Perceptron Gob Interface */\n\nfunc (p *perceptron) GobEncode() ([]byte, error) {\n\tvar buf bytes.Buffer\n\tencoder := gob.NewEncoder(&buf)\n\n\t// if err := encoder.Encode(&p.weights); err != nil {\n\t// \treturn nil, err\n\t// }\n\n\tif err := encoder.Encode(&p.weightsSF); err != nil {\n\t\treturn nil, err\n\t}\n\tif err := encoder.Encode(&p.weightsTF); err != nil {\n\t\treturn nil, err\n\t}\n\n\tif err := encoder.Encode(&p.totals); err != nil {\n\t\treturn nil, err\n\t}\n\n\tif err := encoder.Encode(&p.steps); err != nil {\n\t\treturn nil, err\n\t}\n\n\tif err := encoder.Encode(p.instancesSeen); err != nil {\n\t\treturn nil, err\n\t}\n\n\treturn buf.Bytes(), nil\n}\n\nfunc (p *perceptron) GobDecode(buf []byte) error {\n\tb := bytes.NewBuffer(buf)\n\tdecoder := gob.NewDecoder(b)\n\n\t// if err := decoder.Decode(&p.weights); err != nil {\n\t// \treturn err\n\t// }\n\n\tif err := decoder.Decode(&p.weightsSF); err != nil {\n\t\treturn err\n\t}\n\n\tif err := decoder.Decode(&p.weightsTF); err != nil {\n\t\treturn err\n\t}\n\n\tif err := decoder.Decode(&p.totals); err != nil {\n\t\treturn err\n\t}\n\n\tif err := decoder.Decode(&p.steps); err != nil {\n\t\treturn err\n\t}\n\n\tif err := decoder.Decode(&p.instancesSeen); err != nil {\n\t\treturn err\n\t}\n\n\treturn nil\n}\n\nfunc init() {\n\tgob.Register(singleFeature{})\n\tgob.Register(tupleFeature{})\n}\n"
  },
  {
    "path": "pos/perceptron_io_test.go",
    "content": "// +build stanfordtags\n\npackage pos\n\nimport (\n\t\"bytes\"\n\t\"encoding/gob\"\n\t\"testing\"\n\n\t\"github.com/chewxy/lingo\"\n\t\"github.com/stretchr/testify/assert\"\n)\n\nfunc TestFeatureSerialization(t *testing.T) {\n\tvar f, f2 feature\n\tf = singleFeature{ithWord_, \"hello\"}\n\tf2 = tupleFeature{ithWord_, \"hello\", \"world\"}\n\n\tvar buf bytes.Buffer\n\tencoder := gob.NewEncoder(&buf)\n\tdecoder := gob.NewDecoder(&buf)\n\n\tif err := encoder.Encode(&f); err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tif err := encoder.Encode(&f2); err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tvar decodedF, decodedF2 feature\n\tif err := decoder.Decode(&decodedF); err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tif err := decoder.Decode(&decodedF2); err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tassert.Equal(t, f, decodedF, \"feature not deserialized properly\")\n\tassert.Equal(t, f2, decodedF2, \"feature not deserialized properly\")\n}\n\nfunc TestPerceptron_Serialize(t *testing.T) {\n\tp := newPerceptron()\n\n\t// set up a dummy weight\n\tf := singleFeature{ithWord_, \"hello\"}\n\tw := new([lingo.MAXTAG]float64)\n\tw[lingo.NN] = 0.5\n\tw[lingo.VB] = 0.1\n\tp.weights[f] = w\n\n\tfc := fctuple{f, lingo.VB}\n\tp.totals[fc] = 0.1337\n\tp.steps[fc] = 0.65535\n\n\tp.instancesSeen = 1022\n\n\tvar buf bytes.Buffer\n\tencoder := gob.NewEncoder(&buf)\n\tdecoder := gob.NewDecoder(&buf)\n\n\t// encode\n\tif err := encoder.Encode(p); err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\t// decode\n\tp2 := newPerceptron()\n\tif err := decoder.Decode(p2); err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tassert := assert.New(t)\n\n\tassert.Equal(p.weights, p2.weights, \"The weights have not been deserialized properly\")\n\tassert.Equal(p.totals, p2.totals, \"Totals have not been deserialized properly\")\n\tassert.Equal(p.steps, p2.steps, \"Steps have not been deserialized properly\")\n\tassert.Equal(p.instancesSeen, p2.instancesSeen, \"InstancesSeen not deserialized properly\")\n}\n"
  },
  {
    "path": "pos/postagger.go",
    "content": "package pos\n\nimport (\n\t\"github.com/chewxy/lingo\"\n\t\"github.com/chewxy/lingo/corpus\"\n\t\"github.com/chewxy/lingo/treebank\"\n)\n\n// Tagger is the object that tags an incoming channel of lexemes,\n// and outputs a channel of AnnotatedSentence. Each of the Annotations\n// is tagged with a POSTag\n//\n// The core of the Tagger is the perceptron (unexported).\n//\n// A large percentage of how this POS Tagger works is inspired by Matthew Honnibal's work in spaCy\ntype Tagger struct {\n\t*Model\n\n\tInput    chan lingo.Lexeme\n\tOutput   chan lingo.AnnotatedSentence\n\tprogress chan Progress\n\n\tsentences chan lingo.AnnotatedSentence\n\n\tlingo.Lemmatizer\n\tlingo.Stemmer\n\tcorpus   *corpus.Corpus\n\tclusters map[string]lingo.Cluster // this map is safe for concurrent access because it's readonly\n}\n\n// ConsOpt is a construction option for a Tagger\ntype ConsOpt func(*Tagger)\n\n// WithCorpus creates a *Tagger with an existing Corpus\nfunc WithCorpus(c *corpus.Corpus) ConsOpt {\n\tfn := func(p *Tagger) {\n\t\tp.corpus = c\n\t}\n\treturn fn\n}\n\n// WithLemmatizer creates a *Tagger with a lemmatizer.\n// If no lemmatizer is passed into the POSTagger, then the lemmatization process will be skipped, and the POSTagger will be less accurate\nfunc WithLemmatizer(l lingo.Lemmatizer) ConsOpt {\n\tfn := func(p *Tagger) {\n\t\tp.Lemmatizer = l\n\t}\n\treturn fn\n}\n\n// WithStemmer creates a *Tagger with a stemmer.\n// If no stemmer is passed in, then the stemming will be skipped, and the POSTagger will be less accurate\nfunc WithStemmer(s lingo.Stemmer) ConsOpt {\n\tfn := func(p *Tagger) {\n\t\tp.Stemmer = s\n\t}\n\treturn fn\n}\n\n// WithCluster creates a *Tagger with a brown cluster corpus (a map of strings to the brown clusters).\n// If no brown cluster corpus was passed in, the cluster won't be set, and the POSTagger will be less accurate\nfunc WithCluster(c map[string]lingo.Cluster) ConsOpt {\n\tfn := func(p *Tagger) {\n\t\tp.clusters = 
c\n\t}\n\treturn fn\n}\n\n// WithModel creates a *Tagger with the specified model\nfunc WithModel(m *Model) ConsOpt {\n\tfn := func(p *Tagger) {\n\t\tp.Model = m\n\t}\n\treturn fn\n}\n\n// New creates a new *Tagger\nfunc New(opts ...ConsOpt) *Tagger {\n\tp := &Tagger{\n\t\tOutput: make(chan lingo.AnnotatedSentence),\n\n\t\tsentences: make(chan lingo.AnnotatedSentence),\n\t}\n\n\tfor _, opt := range opts {\n\t\topt(p)\n\t}\n\n\tif p.Model == nil {\n\t\tp.Model = &Model{perceptron: newPerceptron()}\n\t\tp.cachedTags = make(map[string]lingo.POSTag)\n\t}\n\n\treturn p\n}\n\n// Clone() makes a copy of a POSTagger\nfunc (p *Tagger) Clone() *Tagger {\n\treturn &Tagger{\n\t\tModel:  p.Model,\n\t\tcorpus: p.corpus,\n\n\t\tOutput: make(chan lingo.AnnotatedSentence),\n\n\t\tsentences: make(chan lingo.AnnotatedSentence),\n\n\t\tLemmatizer: p.Lemmatizer,\n\t\tStemmer:    p.Stemmer,\n\t\tclusters:   p.clusters,\n\t}\n}\n\n// Run is used to tag a sentence. Lexemes arrive from the lexer in a channel (*Tagger.Input), and an annotated sentence is sent down the Output channel\nfunc (p *Tagger) Run() {\n\tdefer close(p.Output)\n\n\tgo p.getSentences()\n\n\tfor s := range p.sentences {\n\t\tlength := len(s)\n\t\tif length == 0 {\n\t\t\tcontinue\n\t\t}\n\t\tfor i, a := range s {\n\t\t\ttag, ok := p.shortcut(a.Lexeme)\n\t\t\tif !ok {\n\t\t\t\tsf, tf := getFeatures(s, i)\n\t\t\t\ttag = p.perceptron.predict(sf, tf)\n\t\t\t}\n\n\t\t\tp.setTag(a, tag)\n\t\t}\n\t\tp.Output <- s\n\t}\n}\n\n// Lemmatize implements the lingo.Lemmatize interface. It however, defers the actual doing of the job to the Lemmatizer.\nfunc (p *Tagger) Lemmatize(a string, pt lingo.POSTag) ([]string, error) {\n\tif p.Lemmatizer == nil {\n\t\treturn nil, componentUnavailable(\"lemmatizer\")\n\t}\n\treturn p.Lemmatizer.Lemmatize(a, pt)\n}\n\n// Stem implements the lingo.Stemmer interface. 
It, however, defers the actual stemming to the stemmer passed in.\nfunc (p *Tagger) Stem(a string) (string, error) {\n\tif p.Stemmer == nil {\n\t\treturn \"\", componentUnavailable(\"stemmer\")\n\t}\n\treturn p.Stemmer.Stem(a)\n}\n\n// Clusters implements the lingo.AnnotationFixer interface.\nfunc (p *Tagger) Clusters() (map[string]lingo.Cluster, error) {\n\tif p.clusters == nil {\n\t\treturn nil, componentUnavailable(\"clusters\")\n\t}\n\treturn p.clusters, nil\n}\n\n// Progress creates and returns a channel of progress. By default the progress channel isn't created, and no progress info is sent\nfunc (p *Tagger) Progress() <-chan Progress {\n\tif p.progress == nil {\n\t\tp.progress = make(chan Progress)\n\t}\n\treturn p.progress\n}\n\n// Train trains a POSTagger, given a bunch of SentenceTags\nfunc (p *Tagger) Train(sentences []treebank.SentenceTag, iterations int) {\n\tif p.progress != nil {\n\t\tdefer func() {\n\t\t\tclose(p.progress)\n\t\t\tp.progress = nil\n\t\t}()\n\t}\n\n\tp.fillCache(sentences)\n\n\t// Somehow sentenceTag.AnnotatedSentence() is memory leaky.\n\t// As a result, the more training iterations there are, the more memory is used and not released\n\t// hence the cache is necessary.\n\tcache := make(map[string]lingo.AnnotatedSentence)\n\tfor iter := 0; iter < iterations; iter++ {\n\t\tc := 0\n\t\tn := 0\n\t\tshortcutted := 0\n\n\t\tvar s lingo.AnnotatedSentence\n\t\tfor _, sentenceTag := range sentences {\n\t\t\ttags := []lingo.POSTag{lingo.ROOT_TAG}\n\t\t\ttags = append(tags, sentenceTag.Tags...)\n\n\t\t\tvar ok bool\n\t\t\tif s, ok = cache[sentenceTag.String()]; !ok {\n\t\t\t\ts = sentenceTag.AnnotatedSentence(p) // the fixer is used to extract cluster information, etc into the *Annotation\n\t\t\t\tcache[sentenceTag.String()] = s\n\t\t\t}\n\n\t\t\tlength := len(s)\n\t\t\tif length == 0 {\n\t\t\t\tcontinue\n\t\t\t}\n\n\t\t\tfor _, a := range s {\n\t\t\t\tif a == lingo.RootAnnotation() {\n\t\t\t\t\tcontinue\n\t\t\t\t}\n\t\t\t\ta.POSTag = 
lingo.X\n\t\t\t}\n\n\t\t\tfor i, a := range s {\n\t\t\t\t// processing\n\t\t\t\ttruth := tags[i]\n\n\t\t\t\tguess, ok := p.shortcut(a.Lexeme)\n\t\t\t\tif !ok {\n\t\t\t\t\tsf, tf := getFeatures(s, i)\n\t\t\t\t\tguess = p.perceptron.predict(sf, tf)\n\t\t\t\t\tp.perceptron.update(guess, truth, sf, tf)\n\t\t\t\t} else {\n\t\t\t\t\tshortcutted++\n\t\t\t\t}\n\t\t\t\tp.setTag(a, guess)\n\n\t\t\t\tif guess == truth {\n\t\t\t\t\tc++\n\t\t\t\t}\n\t\t\t\tn++\n\t\t\t}\n\t\t}\n\n\t\tif iter%150 == 0 {\n\t\t\tp.perceptron.average()\n\t\t\tlogf(\"Averaged perceptron\")\n\t\t}\n\n\t\tif p.progress != nil {\n\t\t\tp.progress <- Progress{Iter: iter, Correct: c, Count: n, ShortCutted: shortcutted}\n\t\t}\n\n\t\ttreebank.ShuffleSentenceTag(sentences)\n\t}\n\tp.perceptron.average()\n}\n\n// LoadShortcuts allows for domain specific things to be mapped into the tagger.\nfunc (p *Tagger) LoadShortcuts(shortcuts map[string]lingo.POSTag) {\n\tfor shortcut, tags := range shortcuts {\n\t\tp.cachedTags[shortcut] = tags\n\t}\n}\n\nfunc (p *Tagger) fillCache(sentences []treebank.SentenceTag) {\n\tlogf(\"Filling Cache with %d sentences\", len(sentences))\n\n\tvar counter = make(map[string]map[lingo.POSTag]int)\n\n\tfor _, sentenceTag := range sentences {\n\t\ts := sentenceTag.Sentence\n\t\ttags := sentenceTag.Tags\n\n\t\tfor i, lex := range s {\n\t\t\tw := lex.Value\n\t\t\tt := tags[i]\n\n\t\t\t_, ok := counter[w]\n\t\t\tif !ok {\n\t\t\t\tcounter[w] = make(map[lingo.POSTag]int)\n\t\t\t}\n\t\t\tcounter[w][t]++\n\t\t}\n\t}\n\n\tfreqThresh := 30\n\tambiguityThresh := 0.98\n\n\tfor word, tagCounter := range counter {\n\t\tvar maxTag lingo.POSTag\n\t\tvar max int\n\t\tvar n int\n\t\tfor t, c := range tagCounter {\n\t\t\tif c > max {\n\t\t\t\tmaxTag = t\n\t\t\t\tmax = c\n\t\t\t}\n\t\t\tn += c\n\t\t}\n\n\t\tif n >= freqThresh && float64(max)/float64(n) >= ambiguityThresh {\n\t\t\tp.cachedTags[word] = maxTag\n\t\t}\n\t}\n}\n\nfunc (p *Tagger) shortcut(l lingo.Lexeme) (lingo.POSTag, bool) {\n\ttag, ok := 
lingo.POSTagShortcut(l)\n\tif !ok {\n\t\ttag, ok = p.cachedTags[l.Value]\n\t}\n\treturn tag, ok\n}\n\nfunc (p *Tagger) setTag(a *lingo.Annotation, tag lingo.POSTag) {\n\tif a == lingo.NullAnnotation() || a == lingo.RootAnnotation() || a == lingo.StartAnnotation() {\n\t\treturn\n\t}\n\n\ta.POSTag = tag\n\n\tif lemmas, err := p.Lemmatize(a.Value, tag); err == nil && len(lemmas) > 0 {\n\t\t// sort.Strings(lemmas)\n\t\ta.Lemma = lemmas[0]\n\t}\n\n\tif stem, err := p.Stem(a.Value); err == nil {\n\t\ta.Stem = stem\n\t}\n}\n\n// Progress is just a tuple of training progress info\ntype Progress struct {\n\tIter, Correct, Count, ShortCutted int\n}\n"
  },
  {
    "path": "pos/release.go",
    "content": "// +build !debug\n\npackage pos\n\nconst BUILD_DEBUG = \"POS TAGGER: Release Build\"\n\nvar TABCOUNT uint32 = 0\nvar tracking = false\n\nfunc tabcount() int                                   { return 0 }\nfunc enterLoggingContext()                            {}\nfunc leaveLoggingContext()                            {}\nfunc logf(format string, others ...interface{})       {}\nfunc recoverFrom(format string, attrs ...interface{}) {}\n\nfunc (p *Tagger) ShowWeights() {}\nfunc printShortcuts(p *Tagger) {}\n"
  },
  {
    "path": "pos/sentence.go",
    "content": "package pos\n\nimport \"github.com/chewxy/lingo\"\n\n// \"log\"\n\nfunc (p *Tagger) getSentences() {\n\tdefer close(p.sentences)\n\n\tvar sentence lingo.AnnotatedSentence\n\tsentence = append(sentence, lingo.RootAnnotation())\n\n\tfor lexeme := range p.Input {\n\t\tif lexeme.LexemeType != lingo.EOF {\n\t\t\ta := lingo.NewAnnotation()\n\t\t\ta.Lexeme = lexeme\n\t\t\tif err := a.Process(p); err != nil {\n\t\t\t\tpanic(err) // for now\n\t\t\t}\n\t\t\tsentence = append(sentence, a)\n\t\t} else {\n\t\t\tp.sentences <- sentence\n\n\t\t\t// reset\n\t\t\tsentence = lingo.AnnotatedSentence{lingo.RootAnnotation()}\n\t\t}\n\n\t\t// TODO: Sentence splitting\n\t}\n}\n"
  },
  {
    "path": "pos/test_test.go",
    "content": "package pos\n\nimport (\n\t\"github.com/chewxy/lingo\"\n\t\"github.com/kljensen/snowball\"\n)\n\ntype dummyLem struct{}\n\nfunc (dummyLem) Lemmatize(s string, pt lingo.POSTag) ([]string, error) {\n\tif len(s) > 3 {\n\t\treturn []string{\n\t\t\ts[:2],\n\t\t}, nil\n\t}\n\treturn []string{\"\"}, nil\n}\n\ntype dummyStemmer struct{}\n\nfunc (dummyStemmer) Stem(s string) (string, error) {\n\treturn snowball.Stem(s, \"english\", true)\n}\n\nvar clusters = map[string]lingo.Cluster{\n\t\"TEst\": 1,\n\t\"Test\": 1,\n\t\"test\": 1,\n}\n\ntype dummyFix struct {\n\tdummyStemmer\n\tdummyLem\n}\n\nfunc (dummyFix) Clusters() (map[string]lingo.Cluster, error) { return clusters, nil }\n\nconst conllu = `1\tFrom\tfrom\tADP\tIN\t_\t3\tcase\t_\t_\n2\tthe\tthe\tDET\tDT\tDefinite=Def|PronType=Art\t3\tdet\t_\t_\n3\tAP\tAP\tPROPN\tNNP\tNumber=Sing\t4\tnmod\t_\t_\n4\tcomes\tcome\tVERB\tVBZ\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin\t0\troot\t_\t_\n5\tthis\tthis\tDET\tDT\tNumber=Sing|PronType=Dem\t6\tdet\t_\t_\n6\tstory\tstory\tNOUN\tNN\tNumber=Sing\t4\tnsubj\t_\t_\n7\t:\t:\tPUNCT\t:\t_\t4\tpunct\t_\t_\n\n1\tPresident\tPresident\tPROPN\tNNP\tNumber=Sing\t2\tcompound\t_\t_\n2\tBush\tBush\tPROPN\tNNP\tNumber=Sing\t5\tnsubj\t_\t_\n3\ton\ton\tADP\tIN\t_\t4\tcase\t_\t_\n4\tTuesday\tTuesday\tPROPN\tNNP\tNumber=Sing\t5\tnmod\t_\t_\n5\tnominated\tnominate\tVERB\tVBD\tMood=Ind|Tense=Past|VerbForm=Fin\t0\troot\t_\t_\n6\ttwo\ttwo\tNUM\tCD\tNumType=Card\t7\tnummod\t_\t_\n7\tindividuals\tindividual\tNOUN\tNNS\tNumber=Plur\t5\tdobj\t_\t_\n8\tto\tto\tPART\tTO\t_\t9\tmark\t_\t_\n9\treplace\treplace\tVERB\tVB\tVerbForm=Inf\t5\tadvcl\t_\t_\n10\tretiring\tretire\tVERB\tVBG\tVerbForm=Ger\t11\tamod\t_\t_\n11\tjurists\tjurist\tNOUN\tNNS\tNumber=Plur\t9\tdobj\t_\t_\n12\ton\ton\tADP\tIN\t_\t14\tcase\t_\t_\n13\tfederal\tfederal\tADJ\tJJ\tDegree=Pos\t14\tamod\t_\t_\n14\tcourts\tcourt\tNOUN\tNNS\tNumber=Plur\t11\tnmod\t_\t_\n15\tin\tin\tADP\tIN\t_\t18\tcase\t_\t_\n16\tthe\tthe\tDET\tDT\tDef
inite=Def|PronType=Art\t18\tdet\t_\t_\n17\tWashington\tWashington\tPROPN\tNNP\tNumber=Sing\t18\tcompound\t_\t_\n18\tarea\tarea\tNOUN\tNN\tNumber=Sing\t14\tnmod\t_\t_\n19\t.\t.\tPUNCT\t.\t_\t5\tpunct\t_\t_\n\n1\tBush\tBush\tPROPN\tNNP\tNumber=Sing\t2\tnsubj\t_\t_\n2\tnominated\tnominate\tVERB\tVBD\tMood=Ind|Tense=Past|VerbForm=Fin\t0\troot\t_\t_\n3\tJennifer\tJennifer\tPROPN\tNNP\tNumber=Sing\t5\tcompound\t_\t_\n4\tM.\tM.\tPROPN\tNNP\tNumber=Sing\t5\tcompound\t_\t_\n5\tAnderson\tAnderson\tPROPN\tNNP\tNumber=Sing\t2\tdobj\t_\t_\n6\tfor\tfor\tADP\tIN\t_\t11\tcase\t_\t_\n7\ta\ta\tDET\tDT\tDefinite=Ind|PronType=Art\t11\tdet\t_\t_\n8\t15\t15\tNUM\tCD\tNumType=Card\t10\tnummod\t_\t_\n9\t-\t-\tPUNCT\tHYPH\t_\t10\tpunct\t_\t_\n10\tyear\tyear\tNOUN\tNN\tNumber=Sing\t11\tcompound\t_\t_\n11\tterm\tterm\tNOUN\tNN\tNumber=Sing\t2\tnmod\t_\t_\n12\tas\tas\tADP\tIN\t_\t14\tcase\t_\t_\n13\tassociate\tassociate\tADJ\tJJ\tDegree=Pos\t14\tamod\t_\t_\n14\tjudge\tjudge\tNOUN\tNN\tNumber=Sing\t11\tnmod\t_\t_\n15\tof\tof\tADP\tIN\t_\t18\tcase\t_\t_\n16\tthe\tthe\tDET\tDT\tDefinite=Def|PronType=Art\t18\tdet\t_\t_\n17\tSuperior\tSuperior\tPROPN\tNNP\tNumber=Sing\t18\tcompound\t_\t_\n18\tCourt\tCourt\tPROPN\tNNP\tNumber=Sing\t14\tnmod\t_\t_\n19\tof\tof\tADP\tIN\t_\t21\tcase\t_\t_\n20\tthe\tthe\tDET\tDT\tDefinite=Def|PronType=Art\t21\tdet\t_\t_\n21\tDistrict\tDistrict\tPROPN\tNNP\tNumber=Sing\t18\tnmod\t_\t_\n22\tof\tof\tADP\tIN\t_\t23\tcase\t_\t_\n23\tColumbia\tColumbia\tPROPN\tNNP\tNumber=Sing\t21\tnmod\t_\t_\n24\t,\t,\tPUNCT\t,\t_\t2\tpunct\t_\t_\n25\treplacing\treplace\tVERB\tVBG\tVerbForm=Ger\t2\tadvcl\t_\t_\n26\tSteffen\tSteffen\tPROPN\tNNP\tNumber=Sing\t28\tcompound\t_\t_\n27\tW.\tW.\tPROPN\tNNP\tNumber=Sing\t28\tcompound\t_\t_\n28\tGraae\tGraae\tPROPN\tNNP\tNumber=Sing\t25\tdobj\t_\t_\n29\t.\t.\tPUNCT\t.\t_\t2\tpunct\t_\t_\n\n1\tWe\twe\tPRON\tPRP\tCase=Nom|Number=Plur|Person=1|PronType=Prs\t3\tnsubj\t_\t_\n2\t've\thave\tAUX\tVBP\tMood=Ind|Tense=Pres|VerbForm=Fin\t3\taux\t_\t_\n3\tgrown\
tgrow\tVERB\tVBN\tTense=Past|VerbForm=Part\t0\troot\t_\t_\n4\tup\tup\tADP\tRP\t_\t3\tcompound:prt\t_\t_\n5\t.\t.\tPUNCT\t.\t_\t3\tpunct\t_\t_`\n"
  },
  {
    "path": "pos/util.go",
    "content": "package pos\n\nimport (\n\t\"math\"\n\n\t\"github.com/chewxy/lingo\"\n)\n\nfunc maxScore(scores *[lingo.MAXTAG]float64) lingo.POSTag {\n\tvar maxClass lingo.POSTag\n\tmaxVal := -math.MaxFloat64\n\tfor c, v := range scores {\n\t\tif v > maxVal {\n\t\t\tmaxClass = lingo.POSTag(c)\n\t\t\tmaxVal = v\n\t\t}\n\t}\n\n\treturn maxClass\n}\n"
  },
  {
    "path": "pos/util_test.go",
    "content": "package pos\n\nimport (\n\t\"math\"\n\t\"math/rand\"\n\t\"testing\"\n\n\t\"github.com/chewxy/lingo\"\n)\n\nfunc TestMaxScore(t *testing.T) {\n\trand.Seed(1337)\n\tscores := new([lingo.MAXTAG]float64)\n\n\tfor i := range scores {\n\t\tscores[i] = rand.Float64()\n\t\tif lingo.POSTag(i) == lingo.ROOT_TAG {\n\t\t\tscores[i] = math.MaxFloat64\n\t\t}\n\t}\n\n\ttag := maxScore(scores)\n\tif tag != lingo.ROOT_TAG {\n\t\tt.Errorf(\"Expected Score #10 to be the max. Got %d instead\", tag)\n\t}\n}\n"
  },
  {
    "path": "sentence.go",
    "content": "package lingo\n\nimport (\n\t\"bytes\"\n\t\"fmt\"\n\t\"sort\"\n\t\"strings\"\n\n\t\"github.com/pkg/errors\"\n)\n\n/* Lexeme Sentence */\ntype LexemeSentence []Lexeme\n\nfunc NewLexemeSentence() LexemeSentence { return LexemeSentence(make([]Lexeme, 0)) }\n\nfunc (ls LexemeSentence) String() string {\n\tvar buf bytes.Buffer\n\tfor _, lex := range ls {\n\t\tbuf.WriteString(lex.Value)\n\t\tbuf.WriteString(\" \")\n\t}\n\treturn strings.Trim(buf.String(), \" \")\n}\n\n/* Annotated Sentence */\n\n// AnnotatedSentence is a sentence, but each word has been annotated.\ntype AnnotatedSentence []*Annotation\n\nfunc NewAnnotatedSentence() AnnotatedSentence { return make(AnnotatedSentence, 0) }\n\nfunc (as AnnotatedSentence) Clone() AnnotatedSentence {\n\tretVal := make(AnnotatedSentence, len(as))\n\n\tfor i, a := range as {\n\t\t// don't clone rootAnnotation\n\t\tif i == 0 && a == rootAnnotation {\n\t\t\tretVal[i] = a\n\t\t\tcontinue\n\t\t}\n\t\tretVal[i] = a.Clone()\n\t}\n\treturn retVal\n}\n\nfunc (as AnnotatedSentence) SetID() {\n\tfor i, a := range as {\n\t\tif i == 0 && a == rootAnnotation {\n\t\t\tcontinue\n\t\t}\n\t\ta.ID = i\n\t}\n}\n\nfunc (as AnnotatedSentence) Fix() {\n\tif as[0].Lexeme == rootLexeme {\n\t\tas[0] = rootAnnotation\n\t}\n\n\tas.SetID()\n\n\tfor _, a := range as {\n\t\tif a.Head != nil {\n\t\t\tif a.HeadID() == -1 && a.Head.Lexeme == rootLexeme {\n\t\t\t\ta.Head = rootAnnotation\n\t\t\t\tcontinue\n\t\t\t}\n\t\t\ta.SetHead(as[a.HeadID()])\n\t\t}\n\t}\n}\n\nfunc (as AnnotatedSentence) IsValid() bool {\n\t// check that IDs are set\n\tzeroes := 0\n\tfor _, a := range as {\n\t\tif a.ID == 0 {\n\t\t\tzeroes++\n\t\t}\n\t}\n\t// IDs not properly set\n\tif zeroes > 1 {\n\t\treturn false\n\t}\n\n\t// TODO\n\t// check that there is only one root\n\n\treturn true\n}\n\n/* Return slices of x */\n\n// Phrase returns the slice of the sentence. 
While you can do the same by simply doing as[start:end], this method returns errors instead of panicking\nfunc (as AnnotatedSentence) Phrase(start, end int) (AnnotatedSentence, error) {\n\tif start < 0 {\n\t\treturn nil, errors.Errorf(\"Start: %d < 0\", start)\n\t}\n\tif end > len(as) {\n\t\treturn nil, errors.Errorf(\"End: %d > len(as): %d\", end, len(as))\n\t}\n\treturn as[start:end], nil\n}\n\n// IDs returns the list of IDs in the sentence. The return value has exactly the same length as the sentence.\nfunc (as AnnotatedSentence) IDs() []int {\n\tretVal := make([]int, len(as))\n\tfor i, a := range as {\n\t\tretVal[i] = a.ID\n\t}\n\treturn retVal\n}\n\n// Tags returns the POSTags of the sentence. The return value has exactly the same length as the sentence.\nfunc (as AnnotatedSentence) Tags() []POSTag {\n\tretVal := make([]POSTag, len(as))\n\tfor i, a := range as {\n\t\tretVal[i] = a.POSTag\n\t}\n\treturn retVal\n}\n\n// Heads returns the head IDs of the sentence. The return value has exactly the same length as the sentence.\nfunc (as AnnotatedSentence) Heads() []int {\n\tretVal := make([]int, len(as))\n\tfor i, a := range as {\n\t\tretVal[i] = a.HeadID()\n\t}\n\treturn retVal\n}\n\n// Leaves returns the *Annotations which are leaves. If the dependency hasn't been set yet, every single *Annotation is a leaf.\nfunc (as AnnotatedSentence) Leaves() (retVal []int) {\n\tfor i := range as {\n\t\tif len(as.Children(i)) == 0 {\n\t\t\tretVal = append(retVal, i)\n\t\t}\n\t}\n\treturn\n}\n\n// Labels returns the DependencyTypes of the sentence. The return value has exactly the same length as the sentence.\nfunc (as AnnotatedSentence) Labels() []DependencyType {\n\tretVal := make([]DependencyType, len(as))\n\tfor i, a := range as {\n\t\tretVal[i] = a.DependencyType\n\t}\n\treturn retVal\n}\n\n// StringSlice returns the original words as a slice of string. 
The return value has exactly the same length as the sentence.\nfunc (as AnnotatedSentence) StringSlice() []string {\n\tretVal := make([]string, len(as), len(as))\n\tfor i, a := range as {\n\t\tretVal[i] = a.Value\n\t}\n\treturn retVal\n}\n\n// LoweredStringSlice returns the lowercased version of the words in the sentence as a slice of string. The return value has exactly the same length as the sentence.\nfunc (as AnnotatedSentence) LoweredStringSlice() []string {\n\tretVal := make([]string, len(as), len(as))\n\tfor i, a := range as {\n\t\tretVal[i] = a.Lowered\n\t}\n\treturn retVal\n}\n\n// Lemmas returns the lemmas as as slice of string. The return value has exactly the same length as the sentence.\nfunc (as AnnotatedSentence) Lemmas() []string {\n\tlemmas := make([]string, len(as))\n\tfor i, a := range as {\n\t\tlemmas[i] = a.Lemma\n\t}\n\treturn lemmas\n}\n\n// Stems returns the stems as a slice of string. The return value has exactly the same length as the sentence.\nfunc (as AnnotatedSentence) Stems() []string {\n\tstems := make([]string, len(as))\n\tfor i, a := range as {\n\t\tstems[i] = a.Stem\n\t}\n\treturn stems\n}\n\nfunc (as AnnotatedSentence) Children(h int) (retVal []int) {\n\tfor i, v := range as {\n\t\tif v.HeadID() == h {\n\t\t\tretVal = append(retVal, i)\n\t\t}\n\t}\n\treturn\n}\n\nfunc (as AnnotatedSentence) Edges() (retVal []DependencyEdge) {\n\tfor _, a := range as {\n\t\tvar head = -1\n\n\t\tif a.Head != nil {\n\t\t\thead = a.HeadID()\n\t\t}\n\n\t\tif head == -1 {\n\t\t\thead = 0\n\t\t}\n\t\tedge := DependencyEdge{as[head], a, a.DependencyType}\n\t\tretVal = append(retVal, edge)\n\t}\n\tsort.Sort(edgeByID(retVal))\n\treturn\n}\n\n/* To other structures */\n\nfunc (as AnnotatedSentence) Dependency() *Dependency {\n\treturn NewDependency(FromAnnotatedSentence(as))\n}\n\nfunc (as AnnotatedSentence) Tree() *DependencyTree {\n\ttracker := make([]*DependencyTree, len(as))\n\n\trootNode := NewDependencyTree(nil, 0, rootAnnotation)\n\ttracker[0] = 
rootNode\n\n\tfor i := 1; i < len(as); i++ {\n\t\thead := as[i].HeadID()\n\t\tvar headDep *DependencyTree\n\n\t\tif head == -1 {\n\t\t\theadDep = rootNode\n\t\t} else {\n\t\t\theadDep = tracker[head]\n\t\t}\n\n\t\tif headDep == nil {\n\t\t\t// make a dependency for the head\n\t\t\theadDep = NewDependencyTree(nil, head, as[head])\n\t\t\ttracker[head] = headDep\n\t\t}\n\n\t\tdep := tracker[i]\n\n\t\tif dep == nil {\n\t\t\tdep = NewDependencyTree(headDep, i, as[i])\n\t\t\ttracker[i] = dep\n\t\t} else {\n\t\t\tdep.Parent = headDep\n\t\t}\n\n\t\theadDep.AddChild(dep)\n\t\tdep.Type = as[i].DependencyType\n\n\t}\n\t// return tracker[len(tracker)-1]\n\t// log.Printf(\"Tracker: %v, len(as): %d. Root: %v\", tracker, len(as), rootNode.Children)\n\treturn rootNode\n}\n\n// Stringer interface\n\nfunc (as AnnotatedSentence) String() string {\n\tvar buf bytes.Buffer\n\tfor i, a := range as {\n\t\tbuf.WriteString(fmt.Sprintf(\"%s/%s\", a.Value, a.POSTag))\n\t\tif i < len(as)-1 {\n\t\t\tbuf.WriteString(\" \")\n\t\t}\n\t}\n\treturn buf.String()\n}\n\nfunc (as AnnotatedSentence) ValueString() string {\n\tvar buf bytes.Buffer\n\tfor i, a := range as {\n\t\tbuf.WriteString(a.Value)\n\t\tif i < len(as)-1 {\n\t\t\tbuf.WriteString(\" \")\n\t\t}\n\t}\n\treturn buf.String()\n}\n\nfunc (as AnnotatedSentence) LoweredString() string {\n\tvar buf bytes.Buffer\n\tfor i, a := range as {\n\t\tbuf.WriteString(a.Lowered)\n\t\tif i < len(as)-1 {\n\t\t\tbuf.WriteString(\" \")\n\t\t}\n\t}\n\treturn buf.String()\n}\n\nfunc (as AnnotatedSentence) LemmaString() string {\n\tvar buf bytes.Buffer\n\tfor i, a := range as {\n\t\tbuf.WriteString(a.Lemma)\n\t\tif i < len(as)-1 {\n\t\t\tbuf.WriteString(\" \")\n\t\t}\n\t}\n\treturn buf.String()\n}\n\nfunc (as AnnotatedSentence) StemString() string {\n\tvar buf bytes.Buffer\n\tfor i, a := range as {\n\t\tbuf.WriteString(a.Stem)\n\t\tif i < len(as)-1 {\n\t\t\tbuf.WriteString(\" \")\n\t\t}\n\t}\n\treturn buf.String()\n}\n\n// sort interface\nfunc (as 
AnnotatedSentence) Len() int           { return len(as) }\nfunc (as AnnotatedSentence) Swap(i, j int)      { as[i], as[j] = as[j], as[i] }\nfunc (as AnnotatedSentence) Less(i, j int) bool { return as[i].ID < as[j].ID }\n"
  },
  {
    "path": "sets.go",
    "content": "package lingo\n\nimport (\n\t\"bytes\"\n\t\"fmt\"\n)\n\n/* TAG SET */\n\n// TagSet is a set of all the POSTags\ntype TagSet [MAXTAG]bool\n\nfunc (ts TagSet) String() string {\n\tvar buf bytes.Buffer\n\tfor t, v := range ts {\n\t\tbuf.WriteString(fmt.Sprintf(\"%v: %v\\n\", POSTag(t), v))\n\t}\n\treturn buf.String()\n}\n\n// DependencyTypeSet is a set of all the DependencyTypes\ntype DependencyTypeSet [MAXDEPTYPE]bool\n\nfunc (dts DependencyTypeSet) String() string {\n\tvar buf bytes.Buffer\n\tfor t, v := range dts {\n\t\tbuf.WriteString(fmt.Sprintf(\"%v: %v\\n\", DependencyType(t), v))\n\t}\n\treturn buf.String()\n}\n"
  },
  {
    "path": "shape.go",
    "content": "package lingo\n\nimport (\n\t\"bytes\"\n\t\"unicode\"\n)\n\n// Shape represents the shape of a word. It's currently implemented as a string\ntype Shape string\n\nfunc (l Lexeme) Shape() Shape {\n\ts := l.Value\n\n\tif len(s) > 50 {\n\t\treturn Shape(\"Long\")\n\t}\n\n\tvar buf bytes.Buffer\n\n\tpreviousCharShape := ' '\n\tcurrentCharShape := ' '\n\tsequence := 0\n\tfor _, c := range s {\n\t\tswitch {\n\t\tcase unicode.IsLetter(c):\n\t\t\tif unicode.IsUpper(c) {\n\t\t\t\tcurrentCharShape = 'X'\n\t\t\t} else {\n\t\t\t\tcurrentCharShape = 'x'\n\t\t\t}\n\n\t\tcase unicode.IsDigit(c):\n\t\t\tcurrentCharShape = 'd'\n\n\t\tcase l.LexemeType == URI:\n\t\t\treturn Shape(\"URI\")\n\n\t\tdefault:\n\t\t\tcurrentCharShape = c\n\t\t}\n\n\t\tif previousCharShape == currentCharShape {\n\t\t\tsequence++\n\t\t} else {\n\t\t\tsequence = 0 // reset the sequence\n\t\t\tpreviousCharShape = currentCharShape\n\t\t}\n\n\t\tif sequence < 4 {\n\t\t\tbuf.WriteRune(currentCharShape)\n\t\t}\n\t}\n\n\tretVal := buf.String()\n\n\treturn Shape(retVal)\n}\n"
  },
  {
    "path": "stopwords.go",
    "content": "package lingo\n\nimport \"strings\"\n\nconst sw = `a about above across after afterwards again against all almost alone along already also although always am among amongst amoungst amount an and another any anyhow anyone anything anyway anywhere are around as at back be became because become becomes becoming been before beforehand behind being below beside besides between beyond bill both bottom but by call can cannot cant co computer con could couldnt cry de describe detail did didn do does doesn doing don done down due during each eg eight either eleven else elsewhere empty enough etc even ever every everyone everything everywhere except few fifteen fify fill find fire first five for former formerly forty found four from front full further get give go had has hasnt have he hence her here hereafter hereby herein hereupon hers herself him himself his how however hundred i ie if in inc indeed interest into is it its itself just keep kg km last latter latterly least less ltd made make many may me meanwhile might mill mine more moreover most mostly move much must my myself name namely neither never nevertheless next nine no nobody none noone nor not nothing now nowhere of off often on once one only onto or other others otherwise our ours ourselves out over own part per perhaps please put quite rather re really regarding same say see seem seemed seeming seems serious several she should show side since sincere six sixty so some somehow someone something sometime sometimes somewhere still such system take ten than that the their them themselves then thence there thereafter thereby therefore therein thereupon these they thick thin third this those though three through throughout thru thus to together too top toward towards twelve twenty two un under unless until up upon us used using various very via was we well were what whatever when whence whenever where whereafter whereas whereby wherein whereupon wherever whether which while whither who whoever whole 
whom whose why will with within without would yet you your yours yourself yourselves`\n\nvar stopwords = make(map[string]struct{})\n\nfunc init() {\n\tfor _, s := range strings.Split(sw, \" \") {\n\t\tstopwords[s] = empty\n\t}\n\n}\n\nvar specials = `-ROOT- -UNKNOWN-`\n\nfunc UnescapeSpecials(word string) string {\n\tswitch word {\n\tcase \"-LRB-\":\n\t\treturn \"(\"\n\tcase \"-RRB-\":\n\t\treturn \")\"\n\tcase \"``\":\n\t\treturn \"\\\"\"\n\tcase \"-NULL-\":\n\t\treturn \"\"\n\t}\n\treturn word\n}\n"
  },
  {
    "path": "treebank/const_postag_stanford.go",
    "content": "// +build stanfordtags\n\npackage treebank\n\nimport \"github.com/chewxy/lingo\"\n\nvar posTagTable map[string]lingo.POSTag = map[string]lingo.POSTag{\n\t\"X\": lingo.X,\n\n\t\"CC\":   lingo.CC,\n\t\"CD\":   lingo.CD,\n\t\"DT\":   lingo.DT,\n\t\"EX\":   lingo.EX,\n\t\"FW\":   lingo.FW,\n\t\"IN\":   lingo.IN,\n\t\"JJ\":   lingo.JJ,\n\t\"JJR\":  lingo.JJR,\n\t\"JJS\":  lingo.JJS,\n\t\"LS\":   lingo.LS,\n\t\"MD\":   lingo.MD,\n\t\"NN\":   lingo.NN,\n\t\"NNS\":  lingo.NNS,\n\t\"NNP\":  lingo.NNP,\n\t\"NNPS\": lingo.NNPS,\n\t\"PDT\":  lingo.PDT,\n\t\"POS\":  lingo.POS,\n\t\"PRP\":  lingo.PRP,\n\t\"PPRP\": lingo.PPRP,\n\t\"PRP$\": lingo.PPRP,\n\t\"RB\":   lingo.RB,\n\t\"RBR\":  lingo.RBR,\n\t\"RBS\":  lingo.RBS,\n\t\"RP\":   lingo.RP,\n\t\"SYM\":  lingo.SYM,\n\t\"TO\":   lingo.TO,\n\t\"UH\":   lingo.UH,\n\t\"VB\":   lingo.VB,\n\t\"VBD\":  lingo.VBD,\n\t\"VBG\":  lingo.VBG,\n\t\"VBN\":  lingo.VBN,\n\t\"VBP\":  lingo.VBP,\n\t\"VBZ\":  lingo.VBZ,\n\t\"WDT\":  lingo.WDT,\n\t\"WP\":   lingo.WP,\n\t\"PWP\":  lingo.PWP,\n\t\"WP$\":  lingo.PWP,\n\t\"WRB\":  lingo.WRB,\n\n\t// punctuation\n\t\",\":     lingo.COMMA,\n\t\"``\":    lingo.OPENQUOTE,\n\t\"''\":    lingo.CLOSEQUOTE,\n\t\".\":     lingo.FULLSTOP,\n\t\":\":     lingo.COLON,\n\t\"$\":     lingo.DOLLAR,\n\t\"#\":     lingo.HASHSIGN,\n\t\"-LRB-\": lingo.LEFTBRACE,\n\t\"-RRB-\": lingo.RIGHTBRACE,\n\n\t\"ADD\":  lingo.ADD,\n\t\"NFP\":  lingo.NFP,\n\t\"HYPH\": lingo.HYPH,\n\t\"GW\":   lingo.GW,\n\t\"AFX\":  lingo.AFX,\n\t\"XX\":   lingo.XX,\n\n\t\"-NULL-\":    lingo.X,\n\t\"-ROOT-\":    lingo.ROOT_TAG,\n\t\"-UNKNOWN-\": lingo.UNKNOWN_TAG,\n}\n"
  },
  {
    "path": "treebank/const_postag_universal.go",
    "content": "// +build !stanfordtags\n\npackage treebank\n\nimport \"github.com/chewxy/lingo\"\n\nvar posTagTable map[string]lingo.POSTag = map[string]lingo.POSTag{\n\t\"X\":     lingo.X,\n\t\"ADJ\":   lingo.ADJ,\n\t\"ADP\":   lingo.ADP,\n\t\"ADV\":   lingo.ADV,\n\t\"AUX\":   lingo.AUX,\n\t\"CONJ\":  lingo.CONJ,\n\t\"DET\":   lingo.DET,\n\t\"INTJ\":  lingo.INTJ,\n\t\"NOUN\":  lingo.NOUN,\n\t\"NUM\":   lingo.NUM,\n\t\"PART\":  lingo.PART,\n\t\"PRON\":  lingo.PRON,\n\t\"PROPN\": lingo.PROPN,\n\t\"PUNCT\": lingo.PUNCT,\n\t\"SCONJ\": lingo.SCONJ,\n\t\"SYM\":   lingo.SYM,\n\t\"VERB\":  lingo.VERB,\n\n\t\"-NULL-\":    lingo.X,\n\t\"-ROOT-\":    lingo.ROOT_TAG,\n\t\"-UNKNOWN-\": lingo.UNKNOWN_TAG,\n}\n"
  },
  {
    "path": "treebank/const_rel_stanford.go",
    "content": "// +build stanfordrel\n\npackage treebank\n\nimport \"github.com/chewxy/lingo\"\n\nvar dependencyTable map[string]lingo.DependencyType = map[string]lingo.DependencyType{\n\t\"root\":       lingo.Root,\n\t\"dep\":        lingo.Dep,\n\t\"aux\":        lingo.Aux,\n\t\"auxpass\":    lingo.AuxPass,\n\t\"cop\":        lingo.Cop,\n\t\"arg\":        lingo.Arg,\n\t\"agent\":      lingo.Agent,\n\t\"comp\":       lingo.Comp,\n\t\"acomp\":      lingo.AComp,\n\t\"ccomp\":      lingo.CComp,\n\t\"xcomp\":      lingo.XComp,\n\t\"obj\":        lingo.Obj,\n\t\"dobj\":       lingo.DObj,\n\t\"iobj\":       lingo.IObj,\n\t\"pobj\":       lingo.PObj,\n\t\"subj\":       lingo.Subj,\n\t\"nsubj\":      lingo.NSubj,\n\t\"nsubjpass\":  lingo.NSubjPass,\n\t\"csubj\":      lingo.CSubj,\n\t\"csubjpass\":  lingo.CSubjPass,\n\t\"cc\":         lingo.Coordination,\n\t\"conj\":       lingo.Conj,\n\t\"expl\":       lingo.Expl,\n\t\"mod\":        lingo.Mod,\n\t\"amod\":       lingo.AMod,\n\t\"appos\":      lingo.Appos,\n\t\"advcl\":      lingo.Advcl,\n\t\"det\":        lingo.Det,\n\t\"predet\":     lingo.Predet,\n\t\"preconj\":    lingo.Preconj,\n\t\"vmod\":       lingo.Vmod,\n\t\"mwe\":        lingo.MWE,\n\t\"mark\":       lingo.Mark,\n\t\"advmod\":     lingo.AdvMod,\n\t\"neg\":        lingo.Neg,\n\t\"rcmod\":      lingo.RCMod,\n\t\"quantmod\":   lingo.QuantMod,\n\t\"nn\":         lingo.NounMod,\n\t\"npadvmod\":   lingo.NPAdvMod,\n\t\"tmod\":       lingo.TMod,\n\t\"num\":        lingo.Num,\n\t\"number\":     lingo.NumberElement,\n\t\"prep\":       lingo.Prep,\n\t\"poss\":       lingo.Poss,\n\t\"possessive\": lingo.Possessive,\n\t\"prt\":        lingo.PRT,\n\t\"parataxis\":  lingo.Parataxis,\n\t\"goeswith\":   lingo.GoesWith,\n\t\"punct\":      lingo.Punct,\n\t\"ref\":        lingo.Ref,\n\t\"sdep\":       lingo.SDep,\n\t\"xsubj\":      lingo.XSubj,\n\n\t// additional stuff not found in the original, but found in EWT\n\t\"case\":       lingo.Case,\n\t\"compound\":   
lingo.Compound,\n\t\"nmod\":       lingo.NMod,\n\t\"discourse\":  lingo.Discourse,\n\t\"nummod\":     lingo.NumMod,\n\t\"relcl\":      lingo.RelCl,\n\t\"nfincl\":     lingo.NFinCl,\n\t\"nmod:poss\":  lingo.NMod_Poss,\n\t\"nmod:npmod\": lingo.NMod_NPMod,\n\t\"vocative\":   lingo.Vocative,\n\t\"list\":       lingo.List,\n\t\"mwprep\":     lingo.MWPrep,\n\t\"remnant\":    lingo.Remnant,\n\t\"acl\":        lingo.Acl,\n\t\"npmod\":      lingo.NPMod,\n\t\"mdvod\":      lingo.MDVod,\n\t\"detmod\":     lingo.DetMod,\n\n\t// found in NNParser\n\t\"pcomp\": lingo.PComp,\n\n\t\"-NULL-\": lingo.Dep,\n}\n"
  },
  {
    "path": "treebank/const_rel_universal.go",
    "content": "// +build !stanfordrel\n\npackage treebank\n\nimport \"github.com/chewxy/lingo\"\n\nvar dependencyTable map[string]lingo.DependencyType = map[string]lingo.DependencyType{\n\t\"dep\":          lingo.Dep,\n\t\"root\":         lingo.Root,\n\t\"nsubj\":        lingo.NSubj,\n\t\"nsubjpass\":    lingo.NSubjPass,\n\t\"dobj\":         lingo.DObj,\n\t\"iobj\":         lingo.IObj,\n\t\"csubj\":        lingo.CSubj,\n\t\"csubjpass\":    lingo.CSubjPass,\n\t\"ccomp\":        lingo.CComp,\n\t\"xcomp\":        lingo.XComp,\n\t\"nummod\":       lingo.NumMod,\n\t\"appos\":        lingo.Appos,\n\t\"nmod\":         lingo.NMod,\n\t\"acl\":          lingo.ACl,\n\t\"acl:relcl\":    lingo.ACl_RelCl,\n\t\"det\":          lingo.Det,\n\t\"det:predet\":   lingo.Det_PreDet,\n\t\"amod\":         lingo.AMod,\n\t\"neg\":          lingo.Neg,\n\t\"case\":         lingo.Case,\n\t\"nmod:npmod\":   lingo.NMod_NPMod,\n\t\"nmod:tmod\":    lingo.NMod_TMod,\n\t\"nmod:poss\":    lingo.NMod_Poss,\n\t\"advcl\":        lingo.AdvCl,\n\t\"advmod\":       lingo.AdvMod,\n\t\"compound\":     lingo.Compound,\n\t\"compound:prt\": lingo.Compound_Part,\n\t\"name\":         lingo.Name,\n\t\"mwe\":          lingo.MWE,\n\t\"foreign\":      lingo.Foreign,\n\t\"goeswith\":     lingo.GoesWith,\n\t\"list\":         lingo.List,\n\t\"dislocated\":   lingo.Dislocated,\n\t\"parataxis\":    lingo.Parataxis,\n\t\"remnant\":      lingo.Remnant,\n\t\"reparandum\":   lingo.Reparandum,\n\t\"vocative\":     lingo.Vocative,\n\t\"discourse\":    lingo.Discourse,\n\t\"expl\":         lingo.Expl,\n\t\"aux\":          lingo.Aux,\n\t\"auxpass\":      lingo.AuxPass,\n\t\"cop\":          lingo.Cop,\n\t\"mark\":         lingo.Mark,\n\t\"punct\":        lingo.Punct,\n\t\"conj\":         lingo.Conj,\n\t\"cc\":           lingo.Coordination,\n\t\"cc:preconj\":   lingo.CC_PreConj, // https://github.com/UniversalDependencies/docs/issues/221\n\t\"conj:preconj\": lingo.CC_PreConj, // 
https://github.com/UniversalDependencies/docs/issues/221\n\n\t\"-NULL-\": lingo.NoDepType,\n}\n"
  },
  {
    "path": "treebank/sentenceTag.go",
    "content": "package treebank\n\nimport (\n\t\"math/rand\"\n\n\t\"github.com/chewxy/lingo\"\n)\n\n// SentenceTag is a struc that holds a sentence, tags, heads and labels\ntype SentenceTag struct {\n\tSentence lingo.LexemeSentence\n\tTags     []lingo.POSTag\n\tHeads    []int\n\tLabels   []lingo.DependencyType\n}\n\nfunc (s SentenceTag) AnnotatedSentence(f lingo.AnnotationFixer) lingo.AnnotatedSentence {\n\tretVal := lingo.NewAnnotatedSentence()\n\tretVal = append(retVal, lingo.RootAnnotation())\n\n\tfor i, lex := range s.Sentence {\n\t\ta := lingo.NewAnnotation()\n\t\ta.Lexeme = lex\n\t\ta.POSTag = s.Tags[i]\n\t\ta.DependencyType = s.Labels[i]\n\n\t\t// should panic, because SentenceTag is only ever used during training\n\t\tif err := a.Process(f); err != nil {\n\t\t\tpanic(err)\n\t\t}\n\n\t\tretVal = append(retVal, a)\n\t}\n\n\t// add heads\n\tfor i, a := range retVal {\n\t\tif i == 0 {\n\t\t\tcontinue\n\t\t}\n\t\ta.SetHead(retVal[s.Heads[i-1]])\n\t}\n\n\tretVal.Fix()\n\n\treturn retVal\n}\n\nfunc (s SentenceTag) Dependency(f lingo.AnnotationFixer) *lingo.Dependency {\n\tsentence := s.AnnotatedSentence(f)\n\tdep := sentence.Dependency()\n\n\treturn dep\n}\n\nfunc (s SentenceTag) String() string {\n\treturn s.Sentence.String()\n}\n\nfunc ShuffleSentenceTag(s []SentenceTag) []SentenceTag {\n\trand.Seed(1337)\n\tfor i := range s {\n\t\tj := rand.Intn(i + 1)\n\t\ts[i], s[j] = s[j], s[i]\n\t}\n\n\treturn s\n}\n\n/* UTILITY FUNCTIONS */\n\nfunc WrapLexemeSentence(sentence lingo.LexemeSentence) lingo.LexemeSentence {\n\tretSentence := lingo.NewLexemeSentence()\n\tretSentence = append(retSentence, lingo.StartLexeme())\n\tretSentence = append(retSentence, sentence...)\n\tretSentence = append(retSentence, lingo.RootLexeme())\n\treturn retSentence\n}\n\nfunc WrapTags(tagList []lingo.POSTag) []lingo.POSTag {\n\tretVal := append([]lingo.POSTag{lingo.X}, tagList...)\n\tretVal = append(retVal, lingo.X)\n\treturn retVal\n}\n\nfunc WrapHeads(heads []int) []int {\n\tretVal := 
append([]int{0}, heads...)\n\tretVal = append(retVal, 0)\n\treturn retVal\n}\n\nfunc WrapDeps(deps []lingo.DependencyType) []lingo.DependencyType {\n\tretVal := append([]lingo.DependencyType{lingo.Dep}, deps...)\n\tretVal = append(retVal, lingo.Dep)\n\treturn retVal\n}\n"
  },
  {
    "path": "treebank/sentenceTag_test.go",
    "content": "package treebank\n\nimport (\n\t\"strings\"\n\t\"testing\"\n\n\t\"github.com/stretchr/testify/assert\"\n)\n\nfunc TestSentenceTag(t *testing.T) {\n\tassert := assert.New(t)\n\treadr := strings.NewReader(sampleConllu)\n\tst := ReadConllu(readr)[0]\n\n\tcorrectHeads := []int{2, 5, 4, 5, 0, 7, 5, 9, 5, 11, 9, 14, 14, 11, 18, 18, 18, 14, 5}\n\tassert.Equal(correctHeads, st.Heads)\n\n\tdep := st.Dependency(nil)\n\tassert.Equal(correctHeads, dep.Heads()[1:])\n}\n"
  },
  {
    "path": "treebank/treebank.go",
    "content": "package treebank\n\nimport (\n\t\"archive/zip\"\n\t\"io\"\n\t\"log\"\n\n\t\"github.com/chewxy/lingo\"\n\n\t\"bufio\"\n\t\"os\"\n\t\"strconv\"\n\t\"strings\"\n)\n\nvar empty struct{}\n\n// Loader is anything that loads into a slice of SentenceTags. For future uses, to load tree banks\ntype Loader func(string) []SentenceTag\n\n// LoadUniversal loads a treebank file formatted in a CONLLU format\nfunc LoadUniversal(fileName string) []SentenceTag {\n\tf, err := os.Open(fileName)\n\tif err != nil {\n\t\tlog.Printf(\"filename %q\", fileName)\n\t\tpanic(err)\n\t}\n\tdefer f.Close()\n\n\treturn ReadConllu(f)\n}\n\n// ReadConllu reads a file formatted in a CONLLU format\nfunc ReadConllu(reader io.Reader) []SentenceTag {\n\ts, st, sh, sdt := reset()\n\tsentences := make([]SentenceTag, 0)\n\tsentenceCount := 0\n\n\tvar usedTags lingo.TagSet\n\tvar usedDepTypes lingo.DependencyTypeSet\n\tvar unknownTags = make(map[string]struct{})\n\tvar unknownDepType = make(map[string]struct{})\n\n\tcolCount := 0\n\tfor bs := bufio.NewScanner(reader); bs.Scan(); colCount++ {\n\t\tl := bs.Text()\n\t\tif strings.HasPrefix(l, \"#\") {\n\t\t\t// comments\n\t\t\tcontinue\n\t\t}\n\t\tif len(l) == 0 {\n\t\t\t// then this is a new sentence\n\t\t\tsentences = finish(s, st, sh, sdt, sentences)\n\t\t\ts, st, sh, sdt = reset()\n\n\t\t\tsentenceCount++\n\t\t\tcontinue\n\t\t}\n\n\t\tcols := strings.Split(l, \"\\t\")\n\t\tword := cols[1]\n\n\t\tvar tag string\n\t\tswitch lingo.BUILD_TAGSET {\n\t\tcase \"stanfordtags\":\n\t\t\ttag = cols[4]\n\t\tcase \"universaltags\":\n\t\t\ttag = cols[3]\n\t\tdefault:\n\t\t\tpanic(\"Unknown tagset\")\n\t\t}\n\n\t\thead := cols[6]\n\t\tdepType := cols[7]\n\n\t\tvar t lingo.POSTag\n\t\tvar dt lingo.DependencyType\n\t\tvar h int\n\t\tvar ok bool\n\t\tvar err error\n\n\t\tword = lingo.UnescapeSpecials(word)\n\n\t\tlexType := StringToLexType(tag)\n\t\tif t, ok = StringToPOSTag(tag); ok {\n\t\t\tusedTags[t] = true\n\t\t} else {\n\t\t\tunknownTags[tag] = 
empty\n\t\t}\n\n\t\tif h, err = strconv.Atoi(head); err != nil {\n\t\t\tpanic(err) // panic is the right option, because there is no default\n\t\t}\n\n\t\tif dt, ok = StringToDependencyType(depType); ok {\n\t\t\tusedDepTypes[dt] = true\n\t\t} else {\n\t\t\tunknownDepType[depType] = empty\n\t\t}\n\n\t\tlexeme := lingo.Lexeme{word, lexType, sentenceCount, colCount, 0} // TODO: add byte offset\n\t\ts = append(s, lexeme)\n\t\tst = append(st, t)\n\t\tsh = append(sh, h)\n\t\tsdt = append(sdt, dt)\n\t}\n\treturn sentences\n}\n\n// LoadEWT loads a zipped English Web Treebank (as donated by Google)\nfunc LoadEWT(filename string) []SentenceTag {\n\n\tr, err := zip.OpenReader(filename)\n\tif err != nil {\n\t\tpanic(err)\n\t}\n\tdefer r.Close()\n\n\tsentences := make([]SentenceTag, 0)\n\n\tfor _, f := range r.File {\n\t\tcontents, err := f.Open()\n\t\tif err != nil {\n\t\t\tpanic(err)\n\t\t}\n\t\tsentences = append(sentences, ReadConllu(contents)...)\n\t\tcontents.Close()\n\t}\n\n\treturn sentences\n}\n"
  },
  {
    "path": "treebank/treebank_test.go",
    "content": "package treebank\n\nimport (\n\t\"strings\"\n\t\"testing\"\n\n\t\"github.com/chewxy/lingo\"\n\t\"github.com/stretchr/testify/assert\"\n)\n\nconst sampleConllu = `1\tPresident\tPresident\tPROPN\tNNP\tNumber=Sing\t2\tcompound\t_\t_\n2\tBush\tBush\tPROPN\tNNP\tNumber=Sing\t5\tnsubj\t_\t_\n3\ton\ton\tADP\tIN\t_\t4\tcase\t_\t_\n4\tTuesday\tTuesday\tPROPN\tNNP\tNumber=Sing\t5\tnmod\t_\t_\n5\tnominated\tnominate\tVERB\tVBD\tMood=Ind|Tense=Past|VerbForm=Fin\t0\troot\t_\t_\n6\ttwo\ttwo\tNUM\tCD\tNumType=Card\t7\tnummod\t_\t_\n7\tindividuals\tindividual\tNOUN\tNNS\tNumber=Plur\t5\tdobj\t_\t_\n8\tto\tto\tPART\tTO\t_\t9\tmark\t_\t_\n9\treplace\treplace\tVERB\tVB\tVerbForm=Inf\t5\tadvcl\t_\t_\n10\tretiring\tretire\tVERB\tVBG\tVerbForm=Ger\t11\tamod\t_\t_\n11\tjurists\tjurist\tNOUN\tNNS\tNumber=Plur\t9\tdobj\t_\t_\n12\ton\ton\tADP\tIN\t_\t14\tcase\t_\t_\n13\tfederal\tfederal\tADJ\tJJ\tDegree=Pos\t14\tamod\t_\t_\n14\tcourts\tcourt\tNOUN\tNNS\tNumber=Plur\t11\tnmod\t_\t_\n15\tin\tin\tADP\tIN\t_\t18\tcase\t_\t_\n16\tthe\tthe\tDET\tDT\tDefinite=Def|PronType=Art\t18\tdet\t_\t_\n17\tWashington\tWashington\tPROPN\tNNP\tNumber=Sing\t18\tcompound\t_\t_\n18\tarea\tarea\tNOUN\tNN\tNumber=Sing\t14\tnmod\t_\t_\n19\t.\t.\tPUNCT\t.\t_\t5\tpunct\t_\t_\n\n`\n\nfunc Test_ReadConllu(t *testing.T) {\n\tassert := assert.New(t)\n\tst := ReadConllu(strings.NewReader(sampleConllu))[0]\n\n\tcorrectHeads := []int{2, 5, 4, 5, 0, 7, 5, 9, 5, 11, 9, 14, 14, 11, 18, 18, 18, 14, 5}\n\tassert.Equal(correctHeads, st.Heads)\n\n\t// we compare by string to avoid having to build two different test files\n\tvar correctPOS []string\n\tif lingo.BUILD_TAGSET == \"stanfordtags\" {\n\t\tcorrectPOS = 
[]string{\n\t\t\t\"NNP\",\n\t\t\t\"NNP\",\n\t\t\t\"IN\",\n\t\t\t\"NNP\",\n\t\t\t\"VBD\",\n\t\t\t\"CD\",\n\t\t\t\"NNS\",\n\t\t\t\"TO\",\n\t\t\t\"VB\",\n\t\t\t\"VBG\",\n\t\t\t\"NNS\",\n\t\t\t\"IN\",\n\t\t\t\"JJ\",\n\t\t\t\"NNS\",\n\t\t\t\"IN\",\n\t\t\t\"DT\",\n\t\t\t\"NNP\",\n\t\t\t\"NN\",\n\t\t\t\"FULLSTOP\",\n\t\t}\n\t} else {\n\t\tcorrectPOS = []string{\n\t\t\t\"PROPN\",\n\t\t\t\"PROPN\",\n\t\t\t\"ADP\",\n\t\t\t\"PROPN\",\n\t\t\t\"VERB\",\n\t\t\t\"NUM\",\n\t\t\t\"NOUN\",\n\t\t\t\"PART\",\n\t\t\t\"VERB\",\n\t\t\t\"VERB\",\n\t\t\t\"NOUN\",\n\t\t\t\"ADP\",\n\t\t\t\"ADJ\",\n\t\t\t\"NOUN\",\n\t\t\t\"ADP\",\n\t\t\t\"DET\",\n\t\t\t\"PROPN\",\n\t\t\t\"NOUN\",\n\t\t\t\"PUNCT\",\n\t\t}\n\t}\n\n\tassert.Equal(correctPOS, ttos(st.Tags))\n\n\t// the stanford tags are not listed in the CONLLU format\n\tif lingo.BUILD_RELSET != \"stanfordrel\" {\n\t\tvar correctRel []string\n\t\tcorrectRel = []string{\n\t\t\t\"Compound\",\n\t\t\t\"NSubj\",\n\t\t\t\"Case\",\n\t\t\t\"NMod\",\n\t\t\t\"Root\",\n\t\t\t\"NumMod\",\n\t\t\t\"DObj\",\n\t\t\t\"Mark\",\n\t\t\t\"AdvCl\",\n\t\t\t\"AMod\",\n\t\t\t\"DObj\",\n\t\t\t\"Case\",\n\t\t\t\"AMod\",\n\t\t\t\"NMod\",\n\t\t\t\"Case\",\n\t\t\t\"Det\",\n\t\t\t\"Compound\",\n\t\t\t\"NMod\",\n\t\t\t\"Punct\",\n\t\t}\n\n\t\tassert.Equal(correctRel, ltos(st.Labels))\n\t}\n}\n\nfunc ttos(ts []lingo.POSTag) []string {\n\tretVal := make([]string, len(ts))\n\tfor i, t := range ts {\n\t\tretVal[i] = t.String()\n\t}\n\treturn retVal\n}\n\nfunc ltos(ls []lingo.DependencyType) []string {\n\tretVal := make([]string, len(ls))\n\tfor i, l := range ls {\n\t\tretVal[i] = l.String()\n\t}\n\treturn retVal\n}\n"
  },
  {
    "path": "treebank/util.go",
    "content": "package treebank\n\nimport \"github.com/chewxy/lingo\"\n\nvar alreadyLogged map[string]bool = make(map[string]bool)\n\n// TODO : CHECK\nfunc StringToLexType(tag string) lingo.LexemeType {\n\tvar lexType lingo.LexemeType\n\tswitch tag {\n\tcase \"NUM\":\n\t\tlexType = lingo.Number\n\tcase \"PUNCT\":\n\t\tlexType = lingo.Punctuation\n\tcase \"SYM\":\n\t\tlexType = lingo.Symbol\n\tdefault:\n\t\tlexType = lingo.Word\n\t}\n\treturn lexType\n}\n\nfunc StringToPOSTag(tag string) (lingo.POSTag, bool) {\n\tt, ok := posTagTable[tag]\n\n\treturn t, ok\n}\n\nfunc StringToDependencyType(ud string) (lingo.DependencyType, bool) {\n\tdt, ok := dependencyTable[ud]\n\n\treturn dt, ok\n}\n\nfunc reset() (lingo.LexemeSentence, []lingo.POSTag, []int, []lingo.DependencyType) {\n\ts := lingo.NewLexemeSentence()\n\tst := make([]lingo.POSTag, 0)\n\tsh := make([]int, 0)\n\tsdt := make([]lingo.DependencyType, 0)\n\n\treturn s, st, sh, sdt\n}\n\nfunc finish(s lingo.LexemeSentence, st []lingo.POSTag, sh []int, sdt []lingo.DependencyType, sentences []SentenceTag) []SentenceTag {\n\tsentenceTag := SentenceTag{s, st, sh, sdt}\n\tsentences = append(sentences, sentenceTag)\n\n\treturn sentences\n}\n"
  },
  {
    "path": "utils.go",
    "content": "package lingo\n\nfunc InStringSlice(s string, l []string) bool {\n\tfor _, v := range l {\n\t\tif s == v {\n\t\t\treturn true\n\t\t}\n\t}\n\treturn false\n}\n\ntype is func(rune) bool\n\nfunc StringIs(s string, f is) bool {\n\tfor _, c := range s {\n\t\tif !f(c) {\n\t\t\treturn false\n\t\t}\n\t}\n\treturn true\n}\n\nfunc isAscii(r rune) bool {\n\tif r > 255 {\n\t\treturn false\n\t}\n\treturn true\n}\n\nfunc EqStringSlice(a, b []string) bool {\n\tif len(a) != len(b) {\n\t\treturn false\n\t}\n\n\tfor i, v := range a {\n\t\tif v != b[i] {\n\t\t\treturn false\n\t\t}\n\t}\n\treturn true\n}\n"
  },
  {
    "path": "wordFlags.go",
    "content": "package lingo\n\nimport (\n\t\"fmt\"\n\t\"strings\"\n\t\"unicode\"\n)\n\n// WordFlags represent the types a word may be. A word may have multiple flags\ntype WordFlag uint32\n\nconst (\n\tNoFlag WordFlag = iota\n\tIsLetter\n\tIsAscii\n\tIsDigit\n\tIsLower\n\tIsPunct\n\tIsSpace\n\tIsTitle\n\tIsUpper\n\tLikeURL\n\tLikeNum\n\tLikeEmail\n\tIsStopWord\n\tIsOOV // for ner\n\n\tMAXFLAG\n)\n\nfunc (f WordFlag) String() string {\n\treturn fmt.Sprintf(\"%014b\", f)\n}\n\nfunc (l Lexeme) Flags() WordFlag {\n\tvar wf WordFlag\n\n\ts := l.Value\n\n\tif StringIs(s, unicode.IsLetter) {\n\t\twf |= (1 << IsLetter)\n\t}\n\n\tif StringIs(s, unicode.IsDigit) {\n\t\twf |= (1 << IsDigit)\n\t}\n\n\tif StringIs(s, isAscii) {\n\t\twf |= (1 << IsAscii)\n\t}\n\n\tif StringIs(s, unicode.IsLower) {\n\t\twf |= (1 << IsLower)\n\t}\n\n\tif StringIs(s, unicode.IsPunct) {\n\t\twf |= (1 << IsPunct)\n\t}\n\n\tif StringIs(s, unicode.IsSpace) {\n\t\twf |= (1 << IsSpace)\n\t}\n\n\tif StringIs(s, unicode.IsUpper) {\n\t\twf |= (1 << IsUpper)\n\t}\n\n\tif l.LexemeType == URI {\n\t\twf |= (1 << LikeURL)\n\t}\n\n\tif _, ok := NumberWords[strings.ToLower(s)]; ok {\n\t\twf |= (1 << LikeNum)\n\t}\n\n\tif _, ok := stopwords[s]; ok {\n\t\twf |= (1 << IsStopWord)\n\t}\n\n\tif len(s) > 0 {\n\t\tif (unicode.IsUpper(rune(s[0])) || unicode.IsTitle(rune(s[0]))) && StringIs(s[1:], unicode.IsLower) {\n\t\t\twf |= (1 << IsTitle)\n\t\t}\n\t}\n\n\treturn wf\n}\n"
  }
]