Repository: chewxy/lingo
Branch: master
Commit: 491e816b48d4
Files: 128
Total size: 278.9 KB

Directory structure:
gitextract_whqjv2y6/

├── .gitignore
├── .travis.yml
├── CONTRIBUTING.md
├── CONTRIBUTORS.md
├── LICENSE
├── POSTag.go
├── POSTag_stanford.go
├── POSTag_stanford_string.go
├── POSTag_universal.go
├── POSTag_universal_string.go
├── README.md
├── annotation.go
├── annotationSet.go
├── annotationSet_bench_test.go
├── browncluster.go
├── cmd/
│   ├── demo/
│   │   ├── io.go
│   │   ├── main.go
│   │   └── nlp.go
│   ├── dep/
│   │   ├── fixer.go
│   │   ├── io.go
│   │   ├── main.go
│   │   ├── pipeline.go
│   │   └── train.go
│   ├── lexer/
│   │   └── main.go
│   └── pos/
│       ├── crossvalidation.go
│       ├── fixer.go
│       └── main.go
├── const.go
├── corpus/
│   ├── consopt.go
│   ├── corpus.go
│   ├── corpus_test.go
│   ├── functions.go
│   ├── functions_test.go
│   ├── inflection.go
│   ├── inflection_test.go
│   ├── io.go
│   ├── io_test.go
│   ├── lda.go
│   ├── test_test.go
│   └── utils.go
├── dep/
│   ├── README.md
│   ├── arcStandard.go
│   ├── arcStandard_test.go
│   ├── configuration.go
│   ├── configuration_test.go
│   ├── debug.go
│   ├── dependencyParser.go
│   ├── documentation/
│   │   ├── iamhuman.dot
│   │   └── thecatsatonthemat.dot
│   ├── errors.go
│   ├── evaluation.go
│   ├── example.go
│   ├── example_test.go
│   ├── featureExtraction.go
│   ├── features.go
│   ├── features_string.go
│   ├── fix.go
│   ├── init.go
│   ├── models.go
│   ├── models_test.go
│   ├── move.go
│   ├── move_string.go
│   ├── nn2.go
│   ├── nn2_io.go
│   ├── nn2_io_test.go
│   ├── nn2_test.go
│   ├── nnconfig.go
│   ├── release.go
│   ├── span.go
│   ├── test_test.go
│   ├── train.go
│   ├── train_test.go
│   ├── transition.go
│   └── util.go
├── dependency.go
├── dependencyTree.go
├── dependencyType.go
├── dependencyType_stanford.go
├── dependencyType_stanford_string.go
├── dependencyType_universal.go
├── dependencyType_universal_string.go
├── errors.go
├── go.mod
├── go.sum
├── interfaces.go
├── io.go
├── io_test.go
├── lexeme.go
├── lexemetype_string.go
├── lexer/
│   ├── lexer.go
│   ├── lexer_test.go
│   └── stateFn.go
├── lingo.go
├── pos/
│   ├── allinone_test.go
│   ├── context.go
│   ├── context_test.go
│   ├── contexttype_string.go
│   ├── debug.go
│   ├── errors.go
│   ├── features.go
│   ├── features_test.go
│   ├── featuretype_string.go
│   ├── models.go
│   ├── models_test.go
│   ├── perceptron.go
│   ├── perceptron_io.go
│   ├── perceptron_io_test.go
│   ├── postagger.go
│   ├── release.go
│   ├── sentence.go
│   ├── test_test.go
│   ├── util.go
│   └── util_test.go
├── sentence.go
├── sets.go
├── shape.go
├── stopwords.go
├── treebank/
│   ├── const_postag_stanford.go
│   ├── const_postag_universal.go
│   ├── const_rel_stanford.go
│   ├── const_rel_universal.go
│   ├── sentenceTag.go
│   ├── sentenceTag_test.go
│   ├── treebank.go
│   ├── treebank_test.go
│   └── util.go
├── utils.go
└── wordFlags.go

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so

# Folders
_obj
_test

# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out

*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*

_testmain.go

*.exe
*.test
*.prof


================================================
FILE: .travis.yml
================================================
language: go

branches:
  only:
    - master

go:
  - 1.11.x
  - 1.12.x
  - 1.13.x
  - tip

env:
  - GO111MODULE=on

matrix:
  allow_failures:
    - go: tip


================================================
FILE: CONTRIBUTING.md
================================================
# Contributing #

Contributors are welcome! We want to make contributing as easy as possible, and the process is very Github-centric. [Github Issues](https://github.com/chewxy/lingo/issues) are used to manage any contributions and changes. If you don't have a github account, please feel free to email me (my  user name [at] gmail.com), and I'll gladly open an issue on your behalf.

# Process #

Say you have a change you want to make, this is the process:

1. Open an issue.
2. I'll have a brief discussion with you. If you don't feel comfortable with a public discussion, I'm okay to email. 
3. Fork this project on Github, and clone it to your local machine.
4. Make your changes
5. Make sure you have tests. If you foresee breaking any API, it is vital that it be discussed beforehand.
6. Make sure your tests pass.
7. `gofmt` your code
8. Send a Pull Request.

Say you instead saw one of the [many issues](https://github.com/chewxy/lingo/issues) and want to solve one of them. This is the process:

1. Comment on the issue saying you'll pick it up. (Alternatively, email me)
2. Fork this project on Github, and clone it to your local machine.
3. Make your changes
4. Make sure you have tests. If you foresee breaking any API, it is vital that it be discussed beforehand.
5. Make sure your tests pass.
6. `gofmt` your code
7. Send a Pull Request.

## Pull Requests ##

I'll review every pull request. I may request some changes, or delve into further discussions. After that, once I'm satisfied everything passes, I'll merge the pull request. Then I'll add your name into the CONTRIBUTORS list.

# Debugging #

This package comes with a debug tag option. Most subpackages have a `debug.go` which contains a `logf` function for logging anything you wish to trace.

================================================
FILE: CONTRIBUTORS.md
================================================
# Contributors #

* Xuanyi Chew (@chewxy) - initial package

================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) 2017 Chewxy

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: POSTag.go
================================================
package lingo

import (
	"fmt"
	"strings"
)

// POSTag is a Part of Speech Tag. The concrete set of tags is chosen at
// build time (see the tag-set specific files).
type POSTag byte

// posTagLookup maps a tag's name, in both its canonical and lower-cased
// spelling, to the tag itself. It is filled in by init.
var posTagLookup map[string]POSTag

// init builds posTagLookup from every tag in the half-open range [X, MAXTAG).
func init() {
	posTagLookup = make(map[string]POSTag)
	for tag := X; tag < MAXTAG; tag++ {
		name := tag.String()
		for _, key := range []string{name, strings.ToLower(name)} {
			posTagLookup[key] = tag
		}
	}
}

// MarshalText implements encoding.TextMarshaler. The text form of a tag is
// its default fmt representation; it never returns an error.
func (p POSTag) MarshalText() ([]byte, error) {
	text := fmt.Sprint(p)
	return []byte(text), nil
}

// UnmarshalText implements encoding.TextUnmarshaler. Surrounding double
// quotes are stripped (for JSON use, if any) before the name is looked up in
// posTagLookup. A name that is not in the table yields the zero POSTag; no
// error is ever returned.
func (p *POSTag) UnmarshalText(text []byte) error {
	name := strings.Trim(string(text), `"`)
	*p = posTagLookup[name]
	return nil
}

// InPOSTags reports whether x is one of the tags in set.
func InPOSTags(x POSTag, set []POSTag) bool {
	for i := range set {
		if set[i] == x {
			return true
		}
	}
	return false
}

// IsAdjective reports whether x is in the Adjectives set.
func IsAdjective(x POSTag) bool {
	return InPOSTags(x, Adjectives)
}

// IsNoun reports whether x is in the Nouns set.
func IsNoun(x POSTag) bool {
	return InPOSTags(x, Nouns)
}

// IsProperNoun reports whether x is in the ProperNouns set.
func IsProperNoun(x POSTag) bool {
	return InPOSTags(x, ProperNouns)
}

// IsVerb reports whether x is in the Verbs set.
func IsVerb(x POSTag) bool {
	return InPOSTags(x, Verbs)
}

// IsAdverb reports whether x is in the Adverbs set.
func IsAdverb(x POSTag) bool {
	return InPOSTags(x, Adverbs)
}

// IsInterrogative reports whether x is in the Interrogatives set.
func IsInterrogative(x POSTag) bool {
	return InPOSTags(x, Interrogatives)
}

// IsDeterminer reports whether x is in the Determiners set.
func IsDeterminer(x POSTag) bool {
	return InPOSTags(x, Determiners)
}

// IsNumber reports whether x is in the Numbers set.
func IsNumber(x POSTag) bool {
	return InPOSTags(x, Numbers)
}

// IsSymbol reports whether x is in the Symbols set.
func IsSymbol(x POSTag) bool {
	return InPOSTags(x, Symbols)
}


================================================
FILE: POSTag_stanford.go
================================================
// +build stanfordtags

package lingo

//go:generate stringer -type=POSTag -output=POSTag_stanford_string.go

// BUILD_TAGSET names the POS tag set this build was compiled with.
const BUILD_TAGSET = "stanfordtags"

// The Stanford/Penn Treebank POS tags. The values are assigned by iota, so
// the order of this list is significant: X is 0 and MAXTAG is one past the
// last real tag.
const (
	X           POSTag = iota // aka NULLTAG
	UNKNOWN_TAG               // Unknown
	ROOT_TAG                  // For Root
	CC                        // Coordinating conjunction
	CD                        // Cardinal number
	DT                        // Determiner
	EX                        // Existential there
	FW                        // Foreign word
	IN                        // Preposition or subordinating conjunction
	JJ                        // Adjective
	JJR                       // Adjective, comparative
	JJS                       // Adjective, superlative
	LS                        // List item marker
	MD                        // Modal
	NN                        // Noun, singular or mass
	NNS                       // Noun, plural
	NNP                       // Proper noun, singular
	NNPS                      // Proper noun, plural
	PDT                       // Predeterminer
	POS                       // Possessive ending
	PRP                       // Personal pronoun
	PPRP                      // Possessive pronoun (PRP$)
	RB                        // Adverb
	RBR                       // Adverb, comparative
	RBS                       // Adverb, superlative
	RP                        // Particle
	SYM                       // Symbol
	TO                        // to
	UH                        // Interjection
	VB                        // Verb, base form
	VBD                       // Verb, past tense
	VBG                       // Verb, gerund or present participle
	VBN                       // Verb, past participle
	VBP                       // Verb, non-3rd person singular present
	VBZ                       // Verb, 3rd person singular present
	WDT                       // Wh-determiner
	WP                        // Wh-pronoun
	PWP                       // Possessive wh-pronoun (WP$)
	WRB                       // Wh-adverb

	// Punctuation related tags: http://stackoverflow.com/a/21546294
	COMMA      // comma
	FULLSTOP   // full stop
	OPENQUOTE  // Penn Treebank uses ``
	CLOSEQUOTE // Penn Treebank uses ''
	COLON
	DOLLAR
	HASHSIGN
	LEFTBRACE
	RIGHTBRACE

	// Extensions for web text: https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/etb-supplementary-guidelines-2009-addendum.pdf
	// http://clear.colorado.edu/compsem/documents/treebank_guidelines.pdf
	HYPH // Hyphen in split compounds
	AFX  // affix
	ADD  // url or email address
	NFP  // superfluous (non final) punctuation
	GW   // Goes With
	XX   // deidentified data (aka gibberish)

	MAXTAG
)

// POSTagShortcut lets the POS tagger short-circuit its decision for lexemes
// whose tag follows directly from the lexeme type (and, for punctuation, from
// the lexeme's value). The boolean result reports whether a shortcut applied;
// when it is false the returned tag is X and carries no information.
func POSTagShortcut(l Lexeme) (POSTag, bool) {
	switch l.LexemeType {
	case Number, Date, Time:
		return CD, true
	case Symbol:
		return SYM, true
	case URI:
		return ADD, true
	case EOF:
		return X, true
	case Punctuation:
		// Only punctuation with a dedicated tag is shortcut; anything else
		// falls through to the "no shortcut" result below.
		switch l.Value {
		case ",":
			return COMMA, true
		case ".":
			return FULLSTOP, true
		case "``":
			return OPENQUOTE, true
		case "''":
			return CLOSEQUOTE, true
		case ":":
			return COLON, true
		case "#":
			return HASHSIGN, true
		case "(":
			return LEFTBRACE, true
		case ")":
			return RIGHTBRACE, true
		}
	}
	return X, false
}

// Tag sets used by the Is* predicates (IsAdjective, IsNoun, ...).

// Adjectives lists the adjective tags.
var Adjectives = []POSTag{JJ, JJR, JJS}

// Nouns lists the common and proper noun tags.
var Nouns = []POSTag{NN, NNP, NNS, NNPS}

// ProperNouns lists the proper noun tags.
var ProperNouns = []POSTag{NNP, NNPS}

// Verbs lists the verb tags.
var Verbs = []POSTag{VB, VBD, VBG, VBN, VBP, VBZ}

// Adverbs lists the adverb tags.
var Adverbs = []POSTag{RB, RBR, RBS}

// Determiners lists the determiner and predeterminer tags.
var Determiners = []POSTag{DT, PDT}

// Interrogatives lists the wh-word tags.
var Interrogatives = []POSTag{WDT, WP, PWP, WRB}

// Numbers lists the number tags.
var Numbers = []POSTag{CD}

// Symbols lists the symbol and punctuation tags. CLOSEQUOTE is listed
// alongside OPENQUOTE so that both quote tags are treated as symbols.
var Symbols = []POSTag{SYM, FULLSTOP, COMMA, OPENQUOTE, CLOSEQUOTE, COLON, DOLLAR, HASHSIGN, LEFTBRACE, RIGHTBRACE, HYPH, NFP}

// IsIN reports whether x is the subordinating-conjunction tag. It exists
// because that tag is IN in the Stanford tag set but SCONJ in the universal
// dependencies tag set; this build uses IN.
func IsIN(x POSTag) bool {
	return x == IN
}


================================================
FILE: POSTag_stanford_string.go
================================================
// +build stanfordtags

// Code generated by "stringer -type=POSTag -output=POSTag_stanford_string.go"; DO NOT EDIT

package lingo

import "fmt"

const _POSTag_name = "XUNKNOWN_TAGROOT_TAGCCCDDTEXFWINJJJJRJJSLSMDNNNNSNNPNNPSPDTPOSPRPPPRPRBRBRRBSRPSYMTOUHVBVBDVBGVBNVBPVBZWDTWPPWPWRBCOMMAFULLSTOPOPENQUOTECLOSEQUOTECOLONDOLLARHASHSIGNLEFTBRACERIGHTBRACEHYPHAFXADDNFPGWXXMAXTAG"

var _POSTag_index = [...]uint8{0, 1, 12, 20, 22, 24, 26, 28, 30, 32, 34, 37, 40, 42, 44, 46, 49, 52, 56, 59, 62, 65, 69, 71, 74, 77, 79, 82, 84, 86, 88, 91, 94, 97, 100, 103, 106, 108, 111, 114, 119, 127, 136, 146, 151, 157, 165, 174, 184, 188, 191, 194, 197, 199, 201, 207}

// String returns the tag's name, sliced out of _POSTag_name using the offsets
// in _POSTag_index. Values beyond the last indexed tag are rendered as
// "POSTag(n)".
func (i POSTag) String() string {
	if i >= POSTag(len(_POSTag_index)-1) {
		return fmt.Sprintf("POSTag(%d)", i)
	}
	return _POSTag_name[_POSTag_index[i]:_POSTag_index[i+1]]
}


================================================
FILE: POSTag_universal.go
================================================
// +build !stanfordtags

package lingo

//go:generate stringer -type=POSTag -output=POSTag_universal_string.go

// BUILD_TAGSET names the POS tag set this build was compiled with.
const BUILD_TAGSET = "universaltags"

// The universal POS tags. The values are assigned by iota, so the order of
// this list is significant: X is 0 and MAXTAG is one past the last real tag.
const (
	X POSTag = iota // aka NULLTAG
	UNKNOWN_TAG
	ROOT_TAG
	ADJ
	ADP
	ADV
	AUX
	CONJ
	DET
	INTJ
	NOUN
	NUM
	PART
	PRON
	PROPN
	PUNCT
	SCONJ
	SYM
	VERB

	MAXTAG // MAXTAG is provided here as index support
)

// POSTagShortcut lets the POS tagger short-circuit its decision for lexemes
// whose tag follows directly from the lexeme type. The boolean result reports
// whether a shortcut applied; when it is false the returned tag is X and
// carries no information.
func POSTagShortcut(l Lexeme) (POSTag, bool) {
	switch l.LexemeType {
	case Number, Date, Time:
		return NUM, true
	case Punctuation:
		return PUNCT, true
	case Symbol:
		return SYM, true
	case URI, EOF:
		return X, true
	default:
		return X, false
	}
}

// Tag sets used by the Is* predicates (IsAdjective, IsNoun, ...).
var (
	// Adjectives lists the adjective tags.
	Adjectives = []POSTag{ADJ}
	// Nouns lists the common and proper noun tags.
	Nouns = []POSTag{NOUN, PROPN}
	// ProperNouns lists the proper noun tags.
	ProperNouns = []POSTag{PROPN}
	// Verbs lists the verb tags.
	Verbs = []POSTag{VERB}
	// Adverbs lists the adverb tags.
	Adverbs = []POSTag{ADV}
	// Determiners lists the determiner tags.
	Determiners = []POSTag{DET}
	// Interrogatives lists the tags an interrogative word may carry.
	Interrogatives = []POSTag{PRON, DET, ADV}
	// Numbers lists the number tags.
	Numbers = []POSTag{NUM}
	// Symbols lists the symbol and punctuation tags.
	Symbols = []POSTag{SYM, PUNCT}
)

// IsIN reports whether x is the subordinating-conjunction tag. It exists
// because that tag is IN in the Stanford tag set but SCONJ in the universal
// dependencies tag set; this build uses SCONJ.
func IsIN(x POSTag) bool {
	return x == SCONJ
}


================================================
FILE: POSTag_universal_string.go
================================================
// +build !stanfordtags

// Code generated by "stringer -type=POSTag -output=POSTag_universal_string.go"; DO NOT EDIT

package lingo

import "fmt"

const _POSTag_name = "XUNKNOWN_TAGROOT_TAGADJADPADVAUXCONJDETINTJNOUNNUMPARTPRONPROPNPUNCTSCONJSYMVERBMAXTAG"

var _POSTag_index = [...]uint8{0, 1, 12, 20, 23, 26, 29, 32, 36, 39, 43, 47, 50, 54, 58, 63, 68, 73, 76, 80, 86}

// String returns the tag's name, sliced out of _POSTag_name using the offsets
// in _POSTag_index. Values beyond the last indexed tag are rendered as
// "POSTag(n)".
func (i POSTag) String() string {
	if i >= POSTag(len(_POSTag_index)-1) {
		return fmt.Sprintf("POSTag(%d)", i)
	}
	return _POSTag_name[_POSTag_index[i]:_POSTag_index[i+1]]
}


================================================
FILE: README.md
================================================
# lingo #

<img src="https://raw.githubusercontent.com/chewxy/lingo/master/media/gopher_small.png" align="right" />

[![Build Status](https://travis-ci.org/chewxy/lingo.svg?branch=master)](https://travis-ci.org/chewxy/lingo)

package `lingo` provides the data structures and algorithms required for natural language processing.

Specifically, it provides a POS Tagger (`lingo/pos`), a Dependency Parser (`lingo/dep`), and a basic tokenizer (`lingo/lexer`) for English. It also provides data structures for holding corpuses (`lingo/corpus`), and treebanks (`lingo/treebank`).

The aim of this package is to provide a production quality pipeline for natural language processing.

# Install #

The package is go-gettable: `go get -u github.com/chewxy/lingo`

This package and its subpackages depend on very few external packages. Here they are:

| Package | Used For | Vitality | Notes | Licence |
|---------|----------|----------|-------|---------|
| [gorgonia](https://github.com/chewxy/gorgonia) | Machine learning | Vital. It won't be hard to rewrite them, but why? | Same author | [Gorgonia Licence](https://github.com/chewxy/gorgonia/blob/master/LICENSE) (Apache 2.0-like) |
| [gographviz](https://github.com/awalterschulze/gographviz) | Visualization of annotations, and other graph-related visualizations | Vital for visualizations, which are a nice-to-have feature | API last changed 12th April 2017 | [gographviz licence](https://github.com/awalterschulze/gographviz/blob/master/LICENSE) (Apache 2.0) |
| [errors](https://github.com/pkg/errors)  | Errors   | The package won't die without it, but it's a very nice to have | Stable API for the past year | [errors licence](https://github.com/pkg/errors/blob/master/LICENSE) (MIT/BSD like) |
| [set](https://github.com/xtgo/set) | Set operations | Can be easily replaced | Stable API for the past year | [set licence](https://github.com/xtgo/set/blob/master/LICENSE) (MIT/BSD-like) |

# Usage #

See the individual packages for usage. There is also a bunch of executables in the `cmd` directory. They're meant to be examples as to how a natural language processing pipeline can be set up.

A natural language pipeline with this package is heavily channel-driven. Here is an example for dependency parsing:

```go
func main() {
	inputString := `The cat sat on the mat`
	lx := lexer.New("dummy", strings.NewReader(inputString)) // lexer - required to break a sentence up into words.
	pt := pos.New(pos.WithModel(posModel))                   // POS Tagger - required to tag the words with a part of speech tag.
	dp := dep.New(depModel)                                  // Creates a new parser

	// set up a pipeline
	pt.Input = lx.Output
	dp.Input = pt.Output

	// run all
	go lx.Run()
	go pt.Run()
	go dp.Run()

	// wait to receive:
	for {
		select {
		case d := <- dp.Output:
			// do something
		case err:= <-dp.Error:
			// handle error
		}
	}

}

```


# How It Works #
For specific tasks (POS tagging, parsing, named entity recognition etc), refer to the README of each subpackage. This package on its own mainly provides the data structures that the subpackages will use.

Perhaps the most important data structure is the `*Annotation` structure. It basically holds a word and the associated metadata for the word.

For dependency parses, the graph takes three forms: `*Dependency`, `*DependencyTree` and `*Annotation`. All three forms are convertible from one to another. TODO: explain rationale behind each data type.

## Quirks ##

### Very Oddly Specific POS Tags and Dependency Rel Types ###

A particular quirk you may have noticed is that the `POSTag` and `DependencyType` are hard coded in as constants. This package does in fact provide two variations of each: one from Stanford/Penn Treebank and one from [UniversalDependencies](http://universaldependencies.org/).

The main reason for hardcoding these are mainly for performance reasons - knowing ahead how much to allocate reduces a lot of additional work the program has to do. It also reduces the chances of mutating a global variable.

Of course this comes as a tradeoff - programs are limited to these two options. Thankfully there are only a limited number of POS Tag and Dependency Relation types. Two of the most popular ones (Stanford/PTB and Universal Dependencies) have been implemented.

The following build tags are supported:

* stanfordtags
* universaltags
* stanfordrel
* universalrel

To use a specific tagset or relset, build your program thusly: `go build -tags='stanfordtags'`.

The default tag and dependency rel types are the universal dependencies version.

### Lexer ###

You should also note that the tokenizer, `lingo/lexer` is not your usual run-of-the-mill NLP tokenizer. It's a tokenizer that tokenizes by space, with some specific rules for English. It was inspired by Rob Pike's talk on lexers. I thought it'd be cool to write something like that for NLP.

The test cases in package `lingo/lexer` showcase how it handles unicode and other pathological English.

# Contributing #
see CONTRIBUTING.md for more info

# Licence #

This package is licenced under the MIT licence.


================================================
FILE: annotation.go
================================================
package lingo

import (
	"errors"
	"fmt"
	"strings"
)

// Annotation is a word and its metadata.
// This includes the position, its dependency head (if available), its lemma, POSTag, etc.
//
// A collection of Annotations - AnnotatedSentence - is also a representation of a dependency parse.
//
// The fields are exported for easy gobbing (the unexported children field is
// the exception). Be very careful when setting them directly.
type Annotation struct {
	Lexeme
	POSTag
	// NER

	// fields to do with an annotation being in a collection
	DependencyType
	ID       int
	Head     *Annotation
	children AnnotationSet //will not be serialized

	// info about the annotation itself
	Lemma   string
	Lowered string
	Stem    string

	// auxiliary data for processing
	Cluster
	Shape
	WordFlag
}

// NewAnnotation returns a fresh Annotation whose Lexeme is the null lexeme
// and whose remaining fields are zero-valued.
func NewAnnotation() *Annotation {
	a := new(Annotation)
	a.Lexeme = nullLexeme
	return a
}

// AnnotationFromLexTag builds an Annotation from a lexeme and a POS tag and
// runs Process on it. It is only ever used in tests; the fixer is optional.
// It panics if Process fails, which is acceptable because that makes the
// calling test fail.
func AnnotationFromLexTag(l Lexeme, t POSTag, f AnnotationFixer) *Annotation {
	a := new(Annotation)
	a.Lexeme = l
	a.POSTag = t
	a.DependencyType = NoDepType
	a.Lowered = strings.ToLower(l.Value)

	err := a.Process(f)
	if err != nil {
		panic(err)
	}
	return a
}

// Clone returns a copy of a that is detached from any collection: its ID is
// -1, it has no head and no children, and its dependency type is NoDepType.
func (a *Annotation) Clone() *Annotation {
	clone := new(Annotation)
	*clone = *a

	clone.DependencyType = NoDepType
	clone.ID = -1
	clone.Head = nil
	clone.children = nil
	return clone
}

// SetHead makes headAnn the dependency head of a and registers a as one of
// headAnn's children.
//
// The shared sentinel annotations (root, start and null) never collect
// children. A nil headAnn simply clears the head; previously it caused a nil
// pointer dereference when the child was appended.
func (a *Annotation) SetHead(headAnn *Annotation) {
	a.Head = headAnn
	switch headAnn {
	case nil, rootAnnotation, startAnnotation, nullAnnotation:
		return
	}
	headAnn.children = append(headAnn.children, a)
}

// HeadID returns the ID of a's head, or -1 when a has no head.
func (a *Annotation) HeadID() int {
	if a.Head == nil {
		return -1
	}
	return a.Head.ID
}

// IsNumber reports whether a carries a number POS tag while its lexeme is
// not a date, a time or a URI.
func (a *Annotation) IsNumber() bool {
	switch a.LexemeType {
	case Date, Time, URI:
		return false
	}
	return IsNumber(a.POSTag)
}

// String returns the text of the annotation's lexeme.
func (a *Annotation) String() string {
	return a.Lexeme.Value
}

// GoString renders the annotation as "word"/TAG. When the annotation has a
// head, the dependency type and the head's word and tag are included too.
func (a *Annotation) GoString() string {
	self := fmt.Sprintf("%q/%s", a.Lexeme.Value, a.POSTag)
	if a.Head == nil {
		return self
	}

	head := a.Head
	return fmt.Sprintf("(%v) <-%v- (%q/%s) ", self, a.DependencyType, head.Value, head.POSTag)
}

// Process fills in the derived fields of the annotation from its lexeme:
// Lowered, Shape and WordFlag always, and Stem and Cluster when a fixer is
// supplied.
//
// It returns an error when the annotation still holds the null lexeme, or
// when the fixer's stemmer fails with anything other than a
// componentUnavailable error. A failure to obtain the clusters is ignored and
// leaves Cluster untouched.
func (a *Annotation) Process(f AnnotationFixer) error {
	if a.Lexeme == nullLexeme {
		return errors.New("No Lexeme!")
	}

	a.Lowered = strings.ToLower(a.Value)
	a.Shape = a.Lexeme.Shape()
	a.WordFlag = a.Lexeme.Flags()

	if f == nil {
		return nil
	}

	stem, stemErr := f.Stem(a.Lowered)
	if stemErr != nil {
		// a missing stemmer component is tolerated; any other failure is not
		if _, unavailable := stemErr.(componentUnavailable); !unavailable {
			return stemErr
		}
	}
	a.Stem = stem

	if table, clusterErr := f.Clusters(); clusterErr == nil {
		a.Cluster = table[a.Value]
	}
	return nil
}

// Shared sentinel annotations. Fields that are not listed keep their zero
// value (nil head, empty lemma, lowered form and shape, cluster 0).
var (
	// rootAnnotation stands for the root of a dependency parse.
	rootAnnotation = &Annotation{
		Lexeme:         rootLexeme,
		POSTag:         ROOT_TAG,
		DependencyType: Root,
		ID:             0,
		WordFlag:       NoFlag,
	}

	// startAnnotation wraps the start lexeme.
	startAnnotation = &Annotation{
		Lexeme:         startLexeme,
		POSTag:         ROOT_TAG,
		DependencyType: NoDepType,
		ID:             -1,
		WordFlag:       NoFlag,
	}

	// nullAnnotation wraps the null lexeme.
	nullAnnotation = &Annotation{
		Lexeme:         nullLexeme,
		POSTag:         X,
		DependencyType: NoDepType,
		ID:             -1,
		WordFlag:       NoFlag,
	}
)

// RootAnnotation returns the shared root sentinel annotation.
func RootAnnotation() *Annotation {
	return rootAnnotation
}

// StartAnnotation returns the shared start sentinel annotation.
func StartAnnotation() *Annotation {
	return startAnnotation
}

// NullAnnotation returns the shared null sentinel annotation.
func NullAnnotation() *Annotation {
	return nullAnnotation
}

// StringToAnnotation wraps s in a Word lexeme, builds an annotation around it
// and runs Process with the given fixer. It panics with the error's message
// if Process fails.
func StringToAnnotation(s string, f AnnotationFixer) *Annotation {
	a := NewAnnotation()
	a.Lexeme = MakeLexeme(s, Word)

	err := a.Process(f)
	if err != nil {
		panic(err.Error())
	}
	return a
}

// AnnotationFixer bundles the components Annotation.Process uses to fill in
// derived fields: a Lemmatizer, a Stemmer and a source of word clusters.
type AnnotationFixer interface {
	Lemmatizer
	Stemmer
	// Clusters returns the mapping from word to Cluster.
	Clusters() (map[string]Cluster, error)
}


================================================
FILE: annotationSet.go
================================================
package lingo

import (
	"sort"
	"unsafe"

	"github.com/xtgo/set"
)

// AnnotationSet is a collection of annotation pointers. It implements
// sort.Interface, ordering its elements by pointer address.
type AnnotationSet []*Annotation

// Len returns the number of annotations in the set.
func (as AnnotationSet) Len() int {
	return len(as)
}

// Swap exchanges the annotations at positions i and j.
func (as AnnotationSet) Swap(i, j int) {
	as[i], as[j] = as[j], as[i]
}

// Less reports whether the annotation at i has a lower address than the one
// at j.
func (as AnnotationSet) Less(i, j int) bool {
	left := uintptr(unsafe.Pointer(as[i]))
	right := uintptr(unsafe.Pointer(as[j]))
	return left < right
}

// Set sorts as in place and returns the prefix of it that set.Uniq reports
// as unique. The receiver's backing array is reordered by this call.
func (as AnnotationSet) Set() AnnotationSet {
	sort.Sort(as)
	return as[:set.Uniq(as)]
}

// Contains reports whether a (compared by pointer) is in the set.
func (as AnnotationSet) Contains(a *Annotation) bool {
	return as.Index(a) != len(as)
}

// Index returns the position of the first occurrence of a (compared by
// pointer) in the set, or len(as) when a is not present.
func (as AnnotationSet) Index(a *Annotation) int {
	for i := 0; i < len(as); i++ {
		if as[i] == a {
			return i
		}
	}
	return len(as)
}

// Add returns the set with a appended, unless a is already present, in which
// case the set is returned unchanged.
func (as AnnotationSet) Add(a *Annotation) AnnotationSet {
	if !as.Contains(a) {
		as = append(as, a)
	}
	return as
}


================================================
FILE: annotationSet_bench_test.go
================================================
package lingo

import (
	"sort"
	"testing"
	"unsafe"
)

// index2 is the sort-then-binary-search alternative to Index that the
// benchmarks compare against. It sorts as in place (by pointer address, see
// Less) and returns the position of a in the sorted set, or len(as) when a is
// not present.
//
// sort.Search needs a predicate that is false for a prefix of the indices and
// true for the rest, so it searches for the first element whose address is
// not below a's and then confirms that this element really is a. An equality
// predicate is not monotone and makes the binary search miss elements.
func (as AnnotationSet) index2(a *Annotation) int {
	sort.Sort(as)

	target := uintptr(unsafe.Pointer(a))
	i := sort.Search(len(as), func(i int) bool {
		return uintptr(unsafe.Pointer(as[i])) >= target
	})
	if i < len(as) && as[i] == a {
		return i
	}
	return len(as)
}

// benchIndexRes receives every benchmark result so the calls being measured
// cannot be optimised away.
var benchIndexRes int

// benchASIndex measures Index on a set of the given size, looking up one
// annotation that is absent and one that is present on every iteration.
func benchASIndex(size int, b *testing.B) {
	as := make(AnnotationSet, size)
	for i := range as {
		as[i] = new(Annotation)
	}

	missing := new(Annotation)
	present := as[0]

	for n := 0; n < b.N; n++ {
		benchIndexRes = as.Index(missing)
		benchIndexRes = as.Index(present)
	}
}

// benchASIndex2 measures index2 on a set of the given size, looking up one
// annotation that is absent and one that is present on every iteration.
func benchASIndex2(size int, b *testing.B) {
	as := make(AnnotationSet, size)
	for i := range as {
		as[i] = new(Annotation)
	}

	missing := new(Annotation)
	present := as[0]

	for n := 0; n < b.N; n++ {
		benchIndexRes = as.index2(missing)
		benchIndexRes = as.index2(present)
	}
}

// Benchmarks of the linear-scan Index for set sizes from 1 to 1024.
func BenchmarkAnnotationSetIndex_1(b *testing.B)    { benchASIndex(1, b) }
func BenchmarkAnnotationSetIndex_2(b *testing.B)    { benchASIndex(2, b) }
func BenchmarkAnnotationSetIndex_8(b *testing.B)    { benchASIndex(8, b) }
func BenchmarkAnnotationSetIndex_16(b *testing.B)   { benchASIndex(16, b) }
func BenchmarkAnnotationSetIndex_32(b *testing.B)   { benchASIndex(32, b) }
func BenchmarkAnnotationSetIndex_64(b *testing.B)   { benchASIndex(64, b) }
func BenchmarkAnnotationSetIndex_128(b *testing.B)  { benchASIndex(128, b) }
func BenchmarkAnnotationSetIndex_256(b *testing.B)  { benchASIndex(256, b) }
func BenchmarkAnnotationSetIndex_512(b *testing.B)  { benchASIndex(512, b) }
func BenchmarkAnnotationSetIndex_1024(b *testing.B) { benchASIndex(1024, b) }

// Benchmarks of the sort-based index2 for the same set sizes.
func BenchmarkAnnotationSetIndex2_1(b *testing.B)    { benchASIndex2(1, b) }
func BenchmarkAnnotationSetIndex2_2(b *testing.B)    { benchASIndex2(2, b) }
func BenchmarkAnnotationSetIndex2_8(b *testing.B)    { benchASIndex2(8, b) }
func BenchmarkAnnotationSetIndex2_16(b *testing.B)   { benchASIndex2(16, b) }
func BenchmarkAnnotationSetIndex2_32(b *testing.B)   { benchASIndex2(32, b) }
func BenchmarkAnnotationSetIndex2_64(b *testing.B)   { benchASIndex2(64, b) }
func BenchmarkAnnotationSetIndex2_128(b *testing.B)  { benchASIndex2(128, b) }
func BenchmarkAnnotationSetIndex2_256(b *testing.B)  { benchASIndex2(256, b) }
func BenchmarkAnnotationSetIndex2_512(b *testing.B)  { benchASIndex2(512, b) }
func BenchmarkAnnotationSetIndex2_1024(b *testing.B) { benchASIndex2(1024, b) }


================================================
FILE: browncluster.go
================================================
package lingo

import (
	"bufio"
	"io"
	"strconv"
	"strings"
)

// this file provides IO support and type safety for brown clusters.
// The creation of brownclusters is not done here.
// Right now lingo does not generate clusters - use PercyLiang's excellent tool for that

// Cluster represents a brown cluster
type Cluster int

// ReadCluster reads PercyLiang's cluster file format and returns a map of
// strings to Cluster.
//
// Every non-blank line is expected to hold three tab-separated fields: the
// cluster as a string of binary digits, the word, and the number of times the
// word was seen. Words seen fewer than three times are mapped to cluster 0,
// because such clusters are not reliable. When a word occurs on several
// lines, the last line wins.
//
// After reading, the table is expanded with recased variants of every word
// (strings.ToLower, strings.ToTitle and strings.ToUpper); a variant is only
// added when the table has no entry for it yet. Words are expanded in file
// order, so when two words share a variant the one that appears first in the
// file supplies its cluster.
//
// Blank lines are skipped. ReadCluster panics on a line with fewer than three
// fields, on a cluster or count that does not parse, and on a read error.
func ReadCluster(r io.Reader) map[string]Cluster {
	scanner := bufio.NewScanner(r)
	clusters := make(map[string]Cluster)

	// order remembers the distinct words in the order they were first seen,
	// so the recasing pass below is deterministic and does not range over the
	// map while inserting into it.
	var order []string

	for scanner.Scan() {
		line := scanner.Text()
		if strings.TrimSpace(line) == "" {
			continue
		}

		splits := strings.Split(line, "\t")
		if len(splits) < 3 {
			panic("lingo: malformed cluster line, want 3 tab-separated fields: " + strconv.Quote(line))
		}
		word := splits[1]

		bits, err := strconv.ParseInt(splits[0], 2, 64)
		if err != nil {
			panic(err)
		}

		freq, err := strconv.Atoi(splits[2])
		if err != nil {
			panic(err)
		}

		if _, seen := clusters[word]; !seen {
			order = append(order, word)
		}

		// if clusterer has only seen a word a few times, then the cluster is not reliable
		if freq >= 3 {
			clusters[word] = Cluster(bits)
		} else {
			clusters[word] = Cluster(0)
		}
	}
	if err := scanner.Err(); err != nil {
		panic(err)
	}

	// expand clusters with recasing
	for _, word := range order {
		clust := clusters[word]
		// NOTE(review): strings.ToTitle maps every letter to its title case,
		// so for most text it yields the same string as strings.ToUpper.
		variants := [...]string{
			strings.ToLower(word),
			strings.ToTitle(word),
			strings.ToUpper(word),
		}
		for _, variant := range variants {
			if _, ok := clusters[variant]; !ok {
				clusters[variant] = clust
			}
		}
	}

	return clusters
}


================================================
FILE: cmd/demo/io.go
================================================
package main

import (
	"log"
	"os"

	"github.com/chewxy/lingo"
	"github.com/chewxy/lingo/dep"
	"github.com/chewxy/lingo/pos"
)

// Paths of the files the demo loads at start-up (see io).
const (
	posModelFile = `model/pos_stanfordtags_universalrel.final.model` // POS tagger model
	depModelFile = `model/dep_stanfordtags_universalrel.final.model` // dependency parser model
	brownCluster = `clusters.txt`                                    // brown cluster file
)

// io loads everything the demo needs into the package-level variables: the
// POS tagger model, the dependency parser model and the brown clusters. Any
// failure is fatal.
func io() {
	var err error
	log.Println("loading POS Tagger model")
	if posModel, err = pos.Load(posModelFile); err != nil {
		log.Fatal(err)
	}

	log.Println("loading Dependency Parser model")
	if depModel, err = dep.Load(depModelFile); err != nil {
		log.Fatal(err)
	}

	var f *os.File
	if f, err = os.Open(brownCluster); err != nil {
		log.Fatal(err)
	}
	// the file is only needed while the clusters are being read
	defer f.Close()
	clusters = lingo.ReadCluster(f)
}


================================================
FILE: cmd/demo/main.go
================================================
package main

import (
	"io/ioutil"
	"os"
	"os/exec"

	"github.com/abiosoft/ishell"
	"github.com/chewxy/lingo"
	"github.com/pkg/browser"
)

// main loads the models and starts an interactive shell with two commands:
//
//	dep  - reads a query, runs it through the pipeline and prints the parse
//	show - renders the last successful parse with graphviz and opens the PNG
func main() {
	io()
	shell := ishell.New()

	// d is the most recent successful parse; "show" renders it.
	var d *lingo.Dependency

	shell.AddCmd(&ishell.Cmd{
		Name: "dep",
		Help: "perform dependency parsing",
		Func: func(c *ishell.Context) {
			c.ShowPrompt(false)
			defer c.ShowPrompt(true)

			c.Print("Query: ")
			query := c.ReadLine()

			parsed, err := pipeline(query)
			if err != nil {
				// Keep the previous parse and do not print a bogus result.
				c.Printf("Error: %v\n", err)
				return
			}
			d = parsed

			c.Printf("%v\n", d)
		},
	})

	shell.AddCmd(&ishell.Cmd{
		Name: "show",
		Help: "show dependency parse on browser",
		Func: func(c *ishell.Context) {
			// Without a parse there is nothing to render; d.Tree() on a nil
			// dependency would crash the shell.
			if d == nil {
				c.Printf("Nothing to show. Run the dep command first\n")
				return
			}

			tmp, err := ioutil.TempFile("", "dep")
			if err != nil {
				c.Printf("Cannot open file %v\n", err)
				return
			}
			defer os.Remove(tmp.Name())

			c.Printf("%v\n", tmp.Name())

			dot := d.Tree().Dot()
			_, writeErr := tmp.Write([]byte(dot))
			closeErr := tmp.Close()
			if writeErr != nil {
				c.Printf("Cannot write file %v\n", writeErr)
				return
			}
			if closeErr != nil {
				c.Printf("Error closing file %v\n", closeErr)
				return
			}

			// "dot -O" writes its output next to the input as <input>.png.
			cmd := exec.Command("dot", "-Tpng", "-O", tmp.Name())
			if err = cmd.Run(); err != nil {
				// No image was produced, so there is nothing to open.
				c.Printf("Cannot execute dot: %v\n", err)
				return
			}

			if err = browser.OpenFile(tmp.Name() + ".png"); err != nil {
				c.Printf("Cannot open browser: %v\n", err)
			}
		},
	})
	shell.Start()
}


================================================
FILE: cmd/demo/nlp.go
================================================
package main

import (
	"fmt"
	"strings"

	"github.com/chewxy/lingo"
	"github.com/chewxy/lingo/dep"
	"github.com/chewxy/lingo/lexer"
	"github.com/chewxy/lingo/pos"
	"github.com/kljensen/snowball"
	"github.com/pkg/errors"
)

// posModel and depModel are the models used by pipeline. Both are populated
// by io; pipeline refuses to run while either is nil.
var posModel *pos.Model
var depModel *dep.Model

// clusters is the word-to-cluster map filled in by io and returned by
// fixer.Clusters.
var clusters map[string]lingo.Cluster

// stemmer stems English words with the snowball package. It is the value
// handed to pos.WithStemmer in pipeline.
type stemmer struct{}

// Stem returns the snowball stem of word, or the error reported by snowball.
func (stemmer) Stem(word string) (string, error) {
	stem, err := snowball.Stem(word, "english", true)
	return stem, err
}

// fixer bundles stemming (through the embedded stemmer), the package-level
// cluster map and a stub lemmatizer.
type fixer struct {
	stemmer
}

// Clusters returns the package-level clusters map. It never fails.
func (fixer) Clusters() (map[string]lingo.Cluster, error) {
	return clusters, nil
}

// Lemmatize always fails with nocomp("lemmatizer"): this program ships
// without a lemmatizer.
func (fixer) Lemmatize(string, lingo.POSTag) ([]string, error) {
	return nil, nocomp("lemmatizer")
}

// nocomp is an error reporting that a component is unavailable. Its value is
// the name of the missing component.
type nocomp string

// Error implements the error interface, e.g. "no lemmatizer".
func (e nocomp) Error() string {
	return fmt.Sprintf("no %s", e.Component())
}

// Component returns the name of the missing component.
func (e nocomp) Component() string {
	return string(e)
}

// pipeline lexes, POS-tags and dependency-parses s, returning the first
// dependency produced by the parser or the first error it reports.
//
// It fails immediately when the models have not been loaded.
func pipeline(s string) (d *lingo.Dependency, err error) {
	if posModel == nil || depModel == nil {
		return nil, errors.Errorf("Unable to create a pipeline")
	}
	lx := lexer.New(s, strings.NewReader(s))
	pt := pos.New(pos.WithModel(posModel), pos.WithStemmer(stemmer{}))
	dp := dep.New(depModel)

	// Chain the stages: lexer -> POS tagger -> dependency parser.
	pt.Input = lx.Output
	dp.Input = pt.Output

	go lx.Run()
	go pt.Run()
	go dp.Run()

	out := dp.Output
	for {
		select {
		case parsed, ok := <-out:
			if !ok {
				// A closed channel is always ready to receive. Disable this
				// case (a nil channel blocks forever) so the loop waits on
				// dp.Error instead of spinning at full CPU.
				out = nil
				continue
			}
			return parsed, nil
		case err = <-dp.Error:
			return nil, err
		}
	}
}


================================================
FILE: cmd/dep/fixer.go
================================================
package main

import (
	"fmt"

	"github.com/chewxy/lingo"
	"github.com/kljensen/snowball"
)

// stemmer stems English words with the snowball package.
type stemmer struct{}

// Stem returns the snowball stem of word, or the error reported by snowball.
func (stemmer) Stem(word string) (string, error) {
	stem, err := snowball.Stem(word, "english", true)
	return stem, err
}

// fixer bundles stemming (through the embedded stemmer), the package-level
// cluster map and a stub lemmatizer.
type fixer struct {
	stemmer
}

// Clusters returns the package-level clusters map. It never fails.
func (fixer) Clusters() (map[string]lingo.Cluster, error) {
	return clusters, nil
}

// Lemmatize always fails with nocomp("lemmatizer"): this program ships
// without a lemmatizer.
func (fixer) Lemmatize(string, lingo.POSTag) ([]string, error) {
	return nil, nocomp("lemmatizer")
}

// nocomp is an error reporting that a component is unavailable. Its value is
// the name of the missing component.
type nocomp string

// Error implements the error interface, e.g. "no lemmatizer".
func (e nocomp) Error() string {
	return fmt.Sprintf("no %s", e.Component())
}

// Component returns the name of the missing component.
func (e nocomp) Component() string {
	return string(e)
}


================================================
FILE: cmd/dep/io.go
================================================
package main

import (
	"log"

	"github.com/chewxy/lingo/dep"
	"github.com/chewxy/lingo/pos"
	"github.com/chewxy/lingo/treebank"
)

// validateFlags checks the parsed command-line flags and derives the toLoad
// and toTrain switches from them. Invalid combinations terminate the program.
// Passing a test file implicitly turns cross validation on.
func validateFlags() {
	hasModel := *load != ""
	hasTraining := *trainFile != ""

	if !hasModel && !hasTraining {
		log.Fatal("Must either load a model or pass in a training file")
	}
	if *epoch < 0 {
		log.Fatal("epochs must only be positive numbers")
	}

	if hasModel {
		toLoad = true
	}
	if hasTraining {
		toTrain = true
	}
	if *testFile != "" {
		*cv = true
	}

	// Not fatal, but worth telling the user about.
	if !hasModel && *save == "" {
		log.Println("WARNING: Models that have been trained will NOT be saved")
	}
}

// loadTreebanks reads the training and test treebanks named by the -train
// and -test flags into trainTB and testTB. A flag left empty is skipped.
func loadTreebanks() {
	if path := *trainFile; path != "" {
		trainTB = treebank.LoadUniversal(path)
	}
	if path := *testFile; path != "" {
		testTB = treebank.LoadUniversal(path)
	}
}

// loadPOSModel loads the POS tagger model named by the -PTmodel flag into
// POSModel. A missing flag or a load failure terminates the program.
func loadPOSModel() {
	path := *loadPOS
	if path == "" {
		log.Fatal("Cannot proceed without having a POS model")
	}

	m, err := pos.Load(path)
	if err != nil {
		log.Fatal(err)
	}
	POSModel = m
}

// loadDepModel loads the dependency parser model named by the -load flag
// into DepModel. A load failure terminates the program.
func loadDepModel() {
	m, err := dep.Load(*load)
	if err != nil {
		log.Fatal(err)
	}
	DepModel = m
}

// saveModel writes DepModel to the path given by the -save flag. It does
// nothing when no path was given or when there is no model yet.
func saveModel() {
	if *save == "" || DepModel == nil {
		return
	}
	DepModel.Save(*save)
}


================================================
FILE: cmd/dep/main.go
================================================
package main

import (
	"flag"
	"log"
	"os"
	"os/signal"
	"runtime/pprof"
	"syscall"

	"github.com/chewxy/lingo"
	"github.com/chewxy/lingo/dep"
	"github.com/chewxy/lingo/pos"
)

// Command-line flags controlling model loading, training and output.
var save = flag.String("save", "", "save as...")
var load = flag.String("load", "", "load a model")
var loadPOS = flag.String("PTmodel", "", "load a POS Tagger model")
var clusterFiles = flag.String("cluster", "", "Brown Cluster files. If nothing is passed in, then the brown cluster won't be used")
var trainFile = flag.String("train", "", "Training on... (Only CONLLU formatted training files are accepted)")
var testFile = flag.String("test", "", "Test on... (Only CONLLU formatted training files are accepted). If this is not provided, the model will be trained without crossvalidation")
var cv = flag.Bool("cv", false, "Cross validate training model? Defaults to false.")
var epoch = flag.Int("epoch", 10, "Training epochs. Defaults to 10")
var format = flag.String("f", "", "Format to output. Default is none. Accepts: {json, dot}")

// Profiling flags. An empty value disables the corresponding profile.
var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file")
var memprofile = flag.String("memprofile", "", "write memory profile to this file")

// clusters is the word-to-cluster map returned by fixer.Clusters.
var clusters map[string]lingo.Cluster

// POSModel and DepModel are the loaded (or freshly trained) models.
var POSModel *pos.Model
var DepModel *dep.Model

// toLoad and toTrain are derived from the flags by validateFlags.
var toLoad, toTrain bool

// init refuses to start when the binary was built with a tag set or relation
// set other than the ones listed below.
func init() {
	switch lingo.BUILD_TAGSET {
	case "stanfordtags", "universaltags":
		// supported
	default:
		log.Fatalf("Tagset %q unsupported", lingo.BUILD_TAGSET)
	}

	switch lingo.BUILD_RELSET {
	case "stanfordrel", "universalrel":
		// supported
	default:
		log.Fatalf("Relset %q unsupported", lingo.BUILD_RELSET)
	}
}

// cleanup blocks until a signal arrives on sigChan, then flushes whatever
// profiles were requested, saves the model and exits with status 1.
//
// cpuprofiling reports whether a CPU profile is running; memprofiling
// reports whether a heap profile should be written to the -memprofile path.
func cleanup(sigChan chan os.Signal, cpuprofiling, memprofiling bool) {
	<-sigChan

	log.Println("EMERGENCY EXIT")
	if cpuprofiling {
		pprof.StopCPUProfile()
	}
	if memprofiling {
		heap, err := os.Create(*memprofile)
		if err != nil {
			log.Fatal(err)
		}
		pprof.WriteHeapProfile(heap)
		heap.Close()
	}
	saveModel()
	os.Exit(1)
}

// main parses and validates the flags, optionally starts CPU profiling,
// installs the signal handler, then loads and/or trains the dependency
// parser model and finally saves it.
func main() {
	flag.Parse()
	validateFlags()

	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
	var cpuprofiling, memprofiling bool
	if *cpuprofile != "" {
		f, err := os.Create(*cpuprofile)
		if err != nil {
			log.Fatal(err)
		}
		// StartCPUProfile can fail (for example when profiling is already
		// enabled); only report profiling as active once it really is.
		if err := pprof.StartCPUProfile(f); err != nil {
			log.Fatal(err)
		}
		cpuprofiling = true
		defer func() {
			pprof.StopCPUProfile()
			f.Close()
		}()
	}

	if *memprofile != "" {
		memprofiling = true
	}

	go cleanup(sigChan, cpuprofiling, memprofiling)

	loadPOSModel()
	if toLoad {
		loadDepModel()
	}

	if toTrain {
		loadTreebanks()
		train()
	}

	saveModel()
}


================================================
FILE: cmd/dep/pipeline.go
================================================
package main

import (
	"encoding/json"
	"fmt"
	"strings"

	"github.com/chewxy/lingo"
	"github.com/chewxy/lingo/dep"
	"github.com/chewxy/lingo/lexer"
	"github.com/chewxy/lingo/pos"
)

// receive prints every dependency arriving on deps in the format selected by
// the -f flag ("json" or "dot"; anything else prints nothing) until a value
// arrives on errs. That value is forwarded on errChan, after which errChan
// is closed and receive returns.
func receive(deps chan *lingo.Dependency, errs, errChan chan error) {
	defer close(errChan)
	for {
		select {
		case d, ok := <-deps:
			if !ok {
				// A closed channel is always ready; disable this case so the
				// loop blocks on errs instead of spinning.
				deps = nil
				continue
			}
			switch *format {
			case "json":
				bs, _ := json.MarshalIndent(d, "", "\t")
				fmt.Printf("%s\n", string(bs))
			case "dot":
				fmt.Printf("%v\n", d.Tree().Dot())
			}

		case err := <-errs:
			errChan <- err
			// The consumer reads errChan once. Looping again would leave
			// this goroutine blocked or spinning forever, and the deferred
			// close would never run.
			return
		}
	}
}

// pipeline lexes, tags and parses s. Parses are printed by receive as they
// arrive; the returned value is whatever receive forwards from the parser's
// error channel.
func pipeline(s string) error {
	errChan := make(chan error)

	lx := lexer.New(s, strings.NewReader(s))
	pt := pos.New(pos.WithModel(POSModel))
	dp := dep.New(DepModel)

	// lexer -> POS tagger -> dependency parser
	pt.Input = lx.Output
	dp.Input = pt.Output

	go lx.Run()
	go pt.Run()
	go receive(dp.Output, dp.Error, errChan)

	// The parser itself runs on the calling goroutine.
	dp.Run()

	err := <-errChan
	return err
}


================================================
FILE: cmd/dep/train.go
================================================
package main

import (
	"log"

	"github.com/chewxy/lingo/dep"
	"github.com/chewxy/lingo/treebank"
	"gorgonia.org/tensor"
)

// trainTB is the training treebank; loadTreebanks fills it from the -train flag.
var trainTB []treebank.SentenceTag

// testTB is the cross-validation treebank; loadTreebanks fills it from the
// -test flag. train cross-validates only when it is non-nil.
var testTB []treebank.SentenceTag

// train builds a dep.Trainer from trainTB (plus testTB for cross validation
// when one was loaded), trains it for the number of epochs given by the
// -epoch flag and stores the resulting model in DepModel.
//
// Costs, and performance figures when cross-validating, are logged from a
// background goroutine while training runs.
func train() {
	conf := dep.DefaultNNConfig
	conf.Dtype = tensor.Float32

	var trainer *dep.Trainer
	if testTB == nil {
		trainer = dep.NewTrainer(dep.WithGeneratedCorpus(trainTB...), dep.WithTrainingSet(trainTB), dep.WithConfig(conf))
		if err := trainer.Init(); err != nil {
			log.Fatalf("Unable to initialize trainer: \n%+v", err)
		}

		costs := trainer.Cost()
		go func() {
			for {
				c, ok := <-costs
				if !ok {
					return
				}
				log.Printf("Cost %v\n", c)
			}
		}()
	} else {
		log.Printf("TRAINING WITH CROSSVALIDATION")
		trainer = dep.NewTrainer(dep.WithGeneratedCorpus(trainTB...), dep.WithTrainingSet(trainTB), dep.WithCrossValidationSet(testTB), dep.WithConfig(conf))
		trainer.SaveBest = "TMP.model"
		if err := trainer.Init(); err != nil {
			log.Fatalf("Unable to initialize trainer: \n%+v", err)
		}

		perf := trainer.Perf()
		costs := trainer.Cost()
		go func() {
			for {
				select {
				case p := <-perf:
					log.Printf("%v\n", p)
				case c := <-costs:
					log.Printf("Cost %v\n", c)
				}
			}
		}()
	}

	if err := trainer.Train(*epoch); err != nil {
		log.Fatal(err)
	}

	DepModel = trainer.Model
}


================================================
FILE: cmd/lexer/main.go
================================================
package main

import (
	"flag"
	"fmt"
	"strings"

	"github.com/chewxy/lingo"
	"github.com/chewxy/lingo/lexer"
)

// input is the text to lex, taken from the -input flag.
var input = flag.String("input", "", "input string to lex")

// output is the unbuffered channel the lexer writes lexemes to; main wires
// it into the lexer and receieve drains it.
var output = make(chan lingo.Lexeme)

// receieve prints every lexeme arriving on output, one per line, and returns
// once output has been closed.
func receieve() {
	for {
		lexeme, open := <-output
		if !open {
			return
		}
		fmt.Printf("%v\n", lexeme)
	}
}

func main() {
	flag.Parse()

	s := *input

	go receieve()
	l := lexer.New(s, strings.NewReader(s))
	l.Output = output
	l.Run()
}


================================================
FILE: cmd/pos/crossvalidation.go
================================================
package main

import (
	"bytes"
	"fmt"
	"log"
	"os"
	"strings"
	"sync"

	"github.com/chewxy/lingo"
	"github.com/chewxy/lingo/lexer"
	"github.com/chewxy/lingo/pos"
	"github.com/chewxy/lingo/treebank"
)

// testResult pairs the tagger's output for one sentence with the reference
// annotation it is compared against.
type testResult struct {
	// tagged is the sentence as produced by the POS tagger (see collect).
	tagged lingo.AnnotatedSentence
	// actual is the reference annotation built from the treebank (see testModel).
	actual lingo.AnnotatedSentence
}

// compare counts the positions at which the tagged and the reference
// sentence carry the same POS tag. Only the overlapping prefix is compared.
// The second result reports whether both sentences have the same length.
func (tr testResult) compare() (int, bool) {
	limit := len(tr.actual)
	if len(tr.tagged) < limit {
		limit = len(tr.tagged)
	}

	matches := 0
	for i := 0; i < limit; i++ {
		if tr.actual[i].POSTag == tr.tagged[i].POSTag {
			matches++
		}
	}
	return matches, len(tr.tagged) == len(tr.actual)
}

// crossValidate consumes results until resultChan is closed and prints the
// overall tagging accuracy together with the share of sentences whose
// tagged length differs from the reference length.
//
// When the inspect flag names a file, every sentence with at least one
// mismatch is written to it. Failing to open that file is fatal; failing to
// write or close it is logged and the summary is still printed.
func crossValidate(resultChan chan testResult) {
	diffLengthCount := 0
	totalLength := 0
	correctCount := 0
	sentences := 0

	var wrongResults []testResult

	for res := range resultChan {
		sentences++
		length := len(res.actual)
		cc, sl := res.compare()
		if !sl {
			diffLengthCount++
		}
		correctCount += cc
		totalLength += length

		if cc != length && *inspect != "" {
			wrongResults = append(wrongResults, res)
		}
	}

	if *inspect != "" {
		// O_TRUNC matters: without it a report shorter than the file's
		// previous content would leave the old tail in place.
		f, err := os.OpenFile(*inspect, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0666)
		if err != nil {
			log.Fatal(err)
		}

		var buf bytes.Buffer
		for _, res := range wrongResults {
			fmt.Fprintf(&buf, "Sentence: \nW:%v\nG:%v\nTags:\nW: %v\nG: %v\n\n", res.actual.StringSlice(), res.tagged.StringSlice(), res.actual.Tags(), res.tagged.Tags())
		}

		if _, err := buf.WriteTo(f); err != nil {
			log.Printf("Cannot write %q: %v", *inspect, err)
		}
		if err := f.Close(); err != nil {
			log.Printf("Cannot close %q: %v", *inspect, err)
		}
	}

	fmt.Printf("CrossValidation: %d/%d = %f. Differing Lengths : %d/%d = %f\n", correctCount, totalLength, float64(correctCount)/float64(totalLength), diffLengthCount, sentences, float64(diffLengthCount)/float64(sentences))
}

// collect pairs every sentence arriving on ch with the reference annotation
// correct and forwards the pair on outCh. It calls wg.Done once ch has been
// closed.
func collect(ch chan lingo.AnnotatedSentence, correct lingo.AnnotatedSentence, outCh chan testResult, wg *sync.WaitGroup) {
	defer wg.Done()

	for {
		tagged, open := <-ch
		if !open {
			return
		}
		outCh <- testResult{tagged: tagged, actual: correct}
	}
}

// testModel tags every treebank sentence with the current model and feeds
// the tagged/reference pairs to crossValidate. One tagging pipeline and one
// collector goroutine are started per sentence; the result channel is closed
// once every collector has finished.
func testModel(sentences []treebank.SentenceTag) {
	results := make(chan testResult)

	produce := func() {
		defer close(results)

		var pending sync.WaitGroup
		pending.Add(len(sentences))
		for _, st := range sentences {
			text := st.String()
			gold := st.AnnotatedSentence(fixer{stemmer{}})
			tagged := make(chan lingo.AnnotatedSentence)
			go collect(tagged, gold, results, &pending)
			go cvpipeline(text, tagged)
		}
		pending.Wait()
	}
	go produce()

	crossValidate(results)
}

// cvpipeline lexes and POS-tags s with the package-level model, sending the
// tagger's results to output. The tagger runs on the calling goroutine.
func cvpipeline(s string, output chan lingo.AnnotatedSentence) {
	lx := lexer.New(s, strings.NewReader(s))
	tagger := pos.New(pos.WithModel(model))

	tagger.Input, tagger.Output = lx.Output, output

	go lx.Run()
	tagger.Run()
}


================================================
FILE: cmd/pos/fixer.go
================================================
// +build !chewxy

package main

import (
	"fmt"

	"github.com/chewxy/lingo"
	"github.com/kljensen/snowball"
)

// stemmer stems English words with the snowball package.
type stemmer struct{}

// Stem returns the snowball stem of word, or the error reported by snowball.
func (stemmer) Stem(word string) (string, error) {
	stem, err := snowball.Stem(word, "english", true)
	return stem, err
}

// fixer bundles stemming (through the embedded stemmer), the package-level
// cluster map and a stub lemmatizer.
type fixer struct {
	stemmer
}

// Clusters returns the package-level clusters map. It never fails.
func (fixer) Clusters() (map[string]lingo.Cluster, error) {
	return clusters, nil
}

// Lemmatize always fails with nocomp("lemmatizer"): this program ships
// without a lemmatizer.
func (fixer) Lemmatize(string, lingo.POSTag) ([]string, error) {
	return nil, nocomp("lemmatizer")
}

// nocomp is an error reporting that a component is unavailable. Its value is
// the name of the missing component.
type nocomp string

// Error implements the error interface, e.g. "no lemmatizer".
func (e nocomp) Error() string {
	return fmt.Sprintf("no %s", e.Component())
}

// Component returns the name of the missing component.
func (e nocomp) Component() string {
	return string(e)
}


================================================
FILE: cmd/pos/main.go
================================================
package main

import (
	"flag"
	"fmt"
	"log"
	"os"
	"os/signal"
	"runtime/pprof"
	"strings"
	"sync"
	"syscall"
	"time"

	"github.com/chewxy/lingo"
	"github.com/chewxy/lingo/lexer"
	"github.com/chewxy/lingo/pos"
	"github.com/chewxy/lingo/treebank"
)

// Command-line flags controlling model loading, training and evaluation.
var save = flag.String("save", "", "save as...")
var load = flag.String("load", "", "load a model")
var clusterFiles = flag.String("cluster", "", "Brown Cluster files. If nothing is passed in, then the brown cluster won't be used")
var trainFile = flag.String("train", "", "Training on... files that end with '.conllu' will be treated as CONLLU formatted files. Files ending with '.zip' will be treted as EWT files")
var testFile = flag.String("test", "", "Test on... Files to cross validate the model on. If this is provided, automatic crossvalidation will be done")
var cv = flag.Bool("cv", false, "Cross validate training model? Defaults to false.")
var epoch = flag.Int("epoch", 1500, "Training epochs. Defaults to 1500")

// NOTE(review): the flag is registered as "inpect", which looks like a typo
// for "inspect". Renaming it would change the command line, so confirm with
// users before fixing.
var inspect = flag.String("inpect", "", "Inspect all the wrong outputs to figure out what went wrong in the POSTagging. This is useful for debugging")
var input = flag.String("input", "", "Input sentence to tag")

// Profiling flags. An empty value disables the corresponding profile.
var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file")
var memprofile = flag.String("memprofile", "", "write memory profile to this file")

// clusters is the word-to-cluster map read from the -cluster file by loadOrTrain.
var clusters map[string]lingo.Cluster

// model is the POS model used for tagging; loadOrTrain sets it when -load is given.
var model *pos.Model

// receive prints one line per annotation of every sentence arriving on
// sentences, and calls wg.Done once the channel has been closed.
func receive(sentences chan lingo.AnnotatedSentence, wg *sync.WaitGroup) {
	defer wg.Done()

	for sent := range sentences {
		for i := range sent {
			a := sent[i]
			fmt.Printf("%#v: %s| %s | %s | %d\n", a, a.POSTag, a.Lemma, a.WordFlag, a.Cluster)
		}
	}
}

// pipeline lexes and POS-tags s with the package-level model and prints the
// annotations. It returns once the printer has drained the tagger's output.
func pipeline(s string) {
	l := lexer.New(s, strings.NewReader(s))
	pt := pos.New(pos.WithModel(model))

	pt.Input = l.Output

	// Register the printer before starting it: Add must happen before the
	// goroutine that calls Done can run.
	var wg sync.WaitGroup
	wg.Add(1)

	go l.Run()
	go receive(pt.Output, &wg)

	pt.Run()
	wg.Wait()
}

// validateFlags checks the parsed command-line flags. Invalid combinations
// terminate the program. Passing a test file implicitly turns cross
// validation on.
func validateFlags() {
	noModel := *load == ""

	if noModel && *trainFile == "" {
		log.Fatal("Must either load a model or pass in a training file")
	}
	if *epoch < 0 {
		log.Fatal("epochs must be positive numbers only!")
	}

	if *testFile != "" {
		*cv = true
	}

	// Not fatal, but worth telling the user about.
	if noModel && *save == "" {
		log.Println("WARNING: Models that are trained will NOT be saved")
	}
}

// loadOrTrain either loads a POS model from the -load path into model, or
// trains a new tagger on the -train file and saves it when -save is given.
//
// The cluster file named by -cluster, when present, is read into clusters
// and handed to the tagger being trained. The training file format is chosen
// by extension: ".zip" is loaded with treebank.LoadEWT, ".conllu" with
// treebank.LoadUniversal, and anything else is opened and passed to
// treebank.ReadConllu.
func loadOrTrain() {
	var trained *pos.Tagger
	if *clusterFiles != "" {
		f, err := os.Open(*clusterFiles)
		if err != nil {
			log.Fatal(err)
		}
		// Release the descriptor when loadOrTrain returns; it was leaked before.
		defer f.Close()
		clusters = lingo.ReadCluster(f)

		trained = pos.New(pos.WithCluster(clusters), pos.WithStemmer(stemmer{}))
	} else {
		trained = pos.New()
	}

	if *load != "" {
		start := time.Now()
		var err error
		if model, err = pos.Load(*load); err != nil {
			log.Fatal(err)
		}
		log.Printf("Loading model from %q took %v", *load, time.Since(start))
		return
	}

	var sentences []treebank.SentenceTag
	switch {
	case strings.HasSuffix(*trainFile, ".zip"):
		sentences = treebank.LoadEWT(*trainFile)

		// TODO split sentences for crossvalidation

	case strings.HasSuffix(*trainFile, ".conllu"):
		sentences = treebank.LoadUniversal(*trainFile)
	default:
		f, err := os.Open(*trainFile)
		if err != nil {
			log.Fatal(err)
		}
		// Deferred to function exit so the file stays open for as long as
		// anything could still be reading from it.
		defer f.Close()

		sentences = treebank.ReadConllu(f)
	}

	log.Printf("Start training for %d epochs...", *epoch)
	start := time.Now()
	trained.Train(sentences, *epoch)
	log.Printf("End Training. Training took %v minutes", time.Since(start).Minutes())

	if *save != "" {
		trained.Save(*save)
		log.Printf("Model saved as: %v", *save)
	}
}

// cleanup blocks until a signal arrives on sigChan, stops the CPU profile
// when profiling is true, and exits with status 1.
func cleanup(sigChan chan os.Signal, profiling bool) {
	<-sigChan

	log.Println("EMERGENCY EXIT")
	if profiling {
		pprof.StopCPUProfile()
	}
	os.Exit(1)
}

// main parses the flags, optionally starts CPU profiling, loads or trains
// the POS model, optionally writes a heap profile, tags the -input sentence
// when one is given and finally cross-validates against the -test file when
// cross validation is enabled.
func main() {
	flag.Parse()

	if lingo.BUILD_TAGSET != "stanfordtags" && lingo.BUILD_TAGSET != "universaltags" {
		log.Fatalf("Tagset: %v is unsupported", lingo.BUILD_TAGSET)
	}

	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)

	var profiling bool
	if *cpuprofile != "" {
		f, err := os.Create(*cpuprofile)
		if err != nil {
			log.Fatal(err)
		}
		// StartCPUProfile can fail; only report profiling as active once it
		// really is.
		if err := pprof.StartCPUProfile(f); err != nil {
			log.Fatal(err)
		}
		profiling = true
		defer func() {
			pprof.StopCPUProfile()
			f.Close()
		}()
	}

	go cleanup(sigChan, profiling)

	validateFlags()
	loadOrTrain()

	if *memprofile != "" {
		f, err := os.Create(*memprofile)
		if err != nil {
			log.Fatal(err)
		}
		// Logged rather than fatal: exiting here would skip the deferred
		// StopCPUProfile and lose the CPU profile as well.
		if err := pprof.WriteHeapProfile(f); err != nil {
			log.Printf("Cannot write heap profile: %v", err)
		}
		if err := f.Close(); err != nil {
			log.Printf("Cannot close heap profile: %v", err)
		}
	}

	if *input != "" {
		pipeline(*input)
	}

	if *cv {
		log.Printf("Cross Validating now")
		testSentences := treebank.LoadUniversal(*testFile)
		testModel(testSentences)
	}

}


================================================
FILE: const.go
================================================
package lingo

// constants that are not pertaining to build tags

// empty is a package-level value of the zero-size type struct{}.
// NOTE(review): presumably the placeholder value for set-like maps; confirm against its users.
var empty struct{}

// NumberWords maps English number words to their integer values: "zero"
// through "twenty", the tens up to "ninety", "hundred", and the powers of a
// thousand up to "quadrillion".
//
// The table was generated with this python code:
/*
	numberWords = {}

	simple = '''zero one two three four five six seven eight nine ten eleven twelve
	        thirteen fourteen fifteen sixteen seventeen eighteen nineteen
	        twenty'''.split()
	for i, word in zip(xrange(0, 20+1), simple):
	    numberWords[word] = i

	tense = '''thirty forty fifty sixty seventy eighty ninety hundred'''.split()
	for i, word in zip(xrange(30, 100+1, 10), tense):
		numberWords[word] = i

	larges = '''thousand million billion trillion quadrillion quintillion sextillion septillion'''.split()
	for i, word in zip(xrange(3, 24+1, 3), larges):
		numberWords[word] = 10**i
*/
var NumberWords = map[string]int{
	// units and teens
	"zero":      0,
	"one":       1,
	"two":       2,
	"three":     3,
	"four":      4,
	"five":      5,
	"six":       6,
	"seven":     7,
	"eight":     8,
	"nine":      9,
	"ten":       10,
	"eleven":    11,
	"twelve":    12,
	"thirteen":  13,
	"fourteen":  14,
	"fifteen":   15,
	"sixteen":   16,
	"seventeen": 17,
	"eighteen":  18,
	"nineteen":  19,

	// tens
	"twenty":  20,
	"thirty":  30,
	"forty":   40,
	"fifty":   50,
	"sixty":   60,
	"seventy": 70,
	"eighty":  80,
	"ninety":  90,

	// magnitudes
	"hundred":     100,
	"thousand":    1000,
	"million":     1000000,
	"billion":     1000000000,
	"trillion":    1000000000000,
	"quadrillion": 1000000000000000,
	// "quintillion": 1000000000000000000,
	// "sextillion": 1000000000000000000000,
	// "septillion": 1000000000000000000000000,
}


================================================
FILE: corpus/consopt.go
================================================
package corpus

import (
	"log"
	"sort"
	"sync/atomic"
	"unicode/utf8"

	"github.com/pkg/errors"
	"github.com/xtgo/set"
)

// ConsOpt is a construction option for manual creation of a Corpus.
// Construct applies each option to the Corpus being built, in order; the
// first option returning a non-nil error aborts construction.
type ConsOpt func(c *Corpus) error

// WithWords creates a corpus from a word list. The list may contain repeated
// words: each distinct word gets one ID and its frequency is the number of
// times it occurs in a.
//
// The option replaces the corpus' words, frequencies, IDs, size, total
// frequency and maximum word length.
//
// NOTE(review): set.Strings appears to sort and deduplicate in place, so the
// caller's slice may be reordered; confirm against the xtgo/set docs.
func WithWords(a []string) ConsOpt {
	f := func(c *Corpus) error {
		s := set.Strings(a)
		c.words = s
		c.frequencies = make([]int, len(s))

		ids := make(map[string]int, len(s))

		var totalFreq, maxWL int
		// NOTE: here we're iterating over the set of words
		for i, w := range s {
			// Track the running maximum. Comparing against c.maxWordLength,
			// which does not change inside the loop, recorded the last
			// qualifying word instead of the longest one.
			if runeCount := utf8.RuneCountInString(w); runeCount > maxWL {
				maxWL = runeCount
			}

			ids[w] = i
		}

		// NOTE: here we're iterating over the original word list.
		for _, w := range a {
			c.frequencies[ids[w]]++
			totalFreq++
		}

		c.ids = ids
		// The word list is replaced wholesale, so the size is set rather
		// than added to whatever was there before.
		atomic.StoreInt64(&c.maxid, int64(len(s)))
		c.totalFreq = totalFreq
		c.maxWordLength = maxWL
		return nil
	}
	return f
}

// WithOrderedWords creates a Corpus with the given word order: the word at
// index i gets ID i and a frequency of 1.
//
// The option replaces the corpus' words, frequencies, IDs, size, total
// frequency and maximum word length. If a word occurs more than once, its
// ID is the index of its last occurrence.
func WithOrderedWords(a []string) ConsOpt {
	f := func(c *Corpus) error {
		c.words = a
		c.frequencies = make([]int, len(a))
		for i := range c.frequencies {
			c.frequencies[i] = 1
		}

		ids := make(map[string]int, len(a))
		var maxWL int
		for i, w := range a {
			// Track the running maximum. Comparing against c.maxWordLength,
			// which does not change inside the loop, recorded the last
			// qualifying word instead of the longest one.
			if runeCount := utf8.RuneCountInString(w); runeCount > maxWL {
				maxWL = runeCount
			}
			ids[w] = i
		}

		c.ids = ids
		// The word list is replaced wholesale, so the size is set rather
		// than added to whatever was there before.
		atomic.StoreInt64(&c.maxid, int64(len(a)))
		c.totalFreq = len(a)
		c.maxWordLength = maxWL
		return nil
	}
	return f
}

// WithSize replaces the corpus' word and frequency slices with empty ones
// whose capacity is size.
func WithSize(size int) ConsOpt {
	prealloc := func(c *Corpus) error {
		c.words, c.frequencies = make([]string, 0, size), make([]int, 0, size)
		return nil
	}
	return prealloc
}

// FromDict is a construction option that builds the corpus from a
// map[string]int in which the int is the word ID. Every word gets a
// frequency of 1. This is useful for constructing corpuses from foreign
// sources where the ID mappings are important.
//
// After sorting, the entry at position i must have ID i; otherwise the
// option returns an error.
func FromDict(d map[string]int) ConsOpt {
	return func(c *Corpus) error {
		sorted := new(sortutil)
		for word, id := range d {
			sorted.words = append(sorted.words, word)
			sorted.ids = append(sorted.ids, id)
		}
		sort.Sort(sorted)

		c.ids = make(map[string]int)
		for i, id := range sorted.ids {
			if id != i {
				return errors.Errorf("Unmarshaling error. Expected %dth ID to be %d. Got %d instead. Perhaps something went wrong during sorting? SLYTHERIN IT IS!", i, i, id)
			}

			w := sorted.words[i]
			c.words = append(c.words, w)
			c.frequencies = append(c.frequencies, 1)
			c.ids[w] = i
			c.totalFreq++

			if n := utf8.RuneCountInString(w); n > c.maxWordLength {
				log.Printf("FD MaxWordLength %d - %q", n, w)
				c.maxWordLength = n
			}
		}
		c.maxid = int64(len(sorted.words))
		return nil
	}

}

// FromDictWithFreq is like FromDict, but each entry also carries the
// frequency of the word.
//
// After sorting, the entry at position i must have ID i; otherwise the
// option returns an error.
func FromDictWithFreq(d map[string]struct{ ID, Freq int }) ConsOpt {
	return func(c *Corpus) error {
		sorted := new(sortutil)
		for word, entry := range d {
			sorted.words = append(sorted.words, word)
			sorted.ids = append(sorted.ids, entry.ID)
			sorted.freqs = append(sorted.freqs, entry.Freq)
		}
		sort.Sort(sorted)

		c.ids = make(map[string]int)
		for i, id := range sorted.ids {
			if id != i {
				return errors.Errorf("Unmarshaling error. Expected %dth ID to be %d. Got %d instead. Perhaps something went wrong during sorting? SLYTHERIN IT IS!", i, i, id)
			}

			w, freq := sorted.words[i], sorted.freqs[i]
			c.words = append(c.words, w)
			c.frequencies = append(c.frequencies, freq)
			c.ids[w] = i
			c.totalFreq += freq

			if n := utf8.RuneCountInString(w); n > c.maxWordLength {
				c.maxWordLength = n
			}
		}
		c.maxid = int64(len(sorted.words))
		return nil
	}
}


================================================
FILE: corpus/corpus.go
================================================
package corpus

import (
	"sync/atomic"
	"unicode/utf8"

	"github.com/pkg/errors"
)

// Corpus is a data structure holding the relevant metadata and information for a corpus of text.
// It serves as vocabulary with ID for lookup. This is very useful as neural networks rely on the IDs rather than the text themselves
type Corpus struct {
	// words[id] is the word with that ID; frequencies[id] is how often it was seen.
	words       []string
	frequencies []int

	// ids is the reverse lookup: word -> ID.
	ids map[string]int

	// atomic read and write plz
	// maxid is the number of IDs handed out so far (see Size).
	maxid int64
	// totalFreq is the sum of all frequencies, repeats included.
	totalFreq int
	// maxWordLength is the rune count of the longest word recorded.
	maxWordLength int
}

// New creates a new *Corpus pre-populated with three special entries:
// "" (NULL, ID 0), "-UNKNOWN-" (ID 1) and "-ROOT-" (ID 2). The specials do
// not count towards the maximum word length.
func New() *Corpus {
	c := &Corpus{
		words:       []string{},
		frequencies: []int{},
		ids:         map[string]int{},
	}

	for _, special := range []string{"", "-UNKNOWN-", "-ROOT-"} {
		c.Add(special)
	}
	c.maxWordLength = 0 // specials don't have lengths

	return c
}

// Construct creates a Corpus by applying the given construction options, in
// order, to an empty Corpus. It stops at the first option that fails and
// returns that error together with a nil Corpus.
func Construct(opts ...ConsOpt) (*Corpus, error) {
	c := &Corpus{
		words:       make([]string, 0),
		frequencies: make([]int, 0),
		ids:         make(map[string]int),
	}

	for i := range opts {
		if err := opts[i](c); err != nil {
			return nil, err
		}
	}
	return c, nil
}

// Id returns the ID of word and whether or not the word was found in the
// corpus.
func (c *Corpus) Id(word string) (id int, ok bool) {
	id, ok = c.ids[word]
	return id, ok
}

// Word returns the word given the ID, and whether or not it was found in the
// corpus. IDs outside [0, Size()) are reported as not found.
func (c *Corpus) Word(id int) (string, bool) {
	size := atomic.LoadInt64(&c.maxid)
	maxid := int(size)

	// A negative id used to slip through and panic on the slice access.
	if id < 0 || id >= maxid {
		return "", false
	}
	return c.words[id], true
}

// Add adds a word to the corpus and returns its ID. A word that is already
// known only has its frequency bumped; a new word is appended with a
// frequency of 1 and may raise the maximum word length.
func (c *Corpus) Add(word string) int {
	if known, seen := c.ids[word]; seen {
		c.frequencies[known]++
		c.totalFreq++
		return known
	}

	// The new ID is the size of the corpus before this word was added.
	fresh := int(atomic.AddInt64(&c.maxid, 1) - 1)
	c.ids[word] = fresh
	c.words = append(c.words, word)
	c.frequencies = append(c.frequencies, 1)
	c.totalFreq++

	if n := utf8.RuneCountInString(word); n > c.maxWordLength {
		c.maxWordLength = n
	}

	return fresh
}

// Size returns the number of IDs handed out by the corpus.
func (c *Corpus) Size() int {
	return int(atomic.LoadInt64(&c.maxid))
}

// WordFreq returns how often word has been seen, or 0 when the word is not
// in the corpus.
func (c *Corpus) WordFreq(word string) int {
	if id, known := c.ids[word]; known {
		return c.frequencies[id]
	}
	return 0
}

// IDFreq returns the frequency of a word given an ID. If the ID is outside
// [0, Size()) it returns 0.
func (c *Corpus) IDFreq(id int) int {
	size := atomic.LoadInt64(&c.maxid)
	maxid := int(size)

	// A negative id used to slip through and panic on the slice access.
	if id < 0 || id >= maxid {
		return 0
	}
	return c.frequencies[id]
}

// TotalFreq returns the total number of words ever seen by the corpus,
// counting every repeat.
func (c *Corpus) TotalFreq() int { return c.totalFreq }

// MaxWordLength returns the length, in runes, of the longest word recorded
// by the corpus.
func (c *Corpus) MaxWordLength() int { return c.maxWordLength }

// WordProb returns the relative frequency of word, i.e. its frequency
// divided by the total frequency of the corpus. The second result is false
// when the word is not in the corpus.
func (c *Corpus) WordProb(word string) (float64, bool) {
	id, found := c.Id(word)
	if !found {
		return 0, false
	}
	return float64(c.frequencies[id]) / float64(c.totalFreq), true
}

// Merge adds every word of other, together with its frequency, to the
// receiver. Only the receiver is mutated.
func (c *Corpus) Merge(other *Corpus) {
	for i := range other.words {
		word, freq := other.words[i], other.frequencies[i]

		id, known := c.ids[word]
		if !known {
			// Add already accounts for one occurrence.
			id = c.Add(word)
			freq--
		}
		c.frequencies[id] += freq
		c.totalFreq += freq
	}
}

// Replace replaces the content of a word. The old reference remains.
//
// e.g: c.Replace("foo", "bar")
// c.Id("foo") will still return a ID. The ID will be the same as c.Id("bar")
//
// It returns an error when a is not in the corpus or when with already is.
func (c *Corpus) Replace(a, with string) error {
	old, ok := c.ids[a]
	if !ok {
		return errors.Errorf("Cannot replace %q with %q. %q is not found", a, with, a)
	}
	if _, ok := c.ids[with]; ok {
		return errors.Errorf("Cannot replace %q with %q. %q exists in the corpus", a, with, with)
	}
	c.words[old] = with
	// Register the new word under the same ID. Without this, Id(with) fails
	// although Word(old) returns with, contradicting the contract above.
	c.ids[with] = old
	return nil

}

// ReplaceWord replaces the word associated with the given ID. The old
// reference remains: the previous word still resolves to id, and so does the
// new one.
//
// It returns an error when id is out of bounds or when with is already in
// the corpus.
func (c *Corpus) ReplaceWord(id int, with string) error {
	// A negative id used to slip through and panic on the slice access.
	if id < 0 || id >= len(c.words) {
		return errors.Errorf("Cannot replace word with ID %d. Out of bounds.", id)
	}
	if _, ok := c.ids[with]; ok {
		return errors.Errorf("Cannot replace word with ID %d with %q. %q exists in the corpus", id, with, with)
	}
	c.words[id] = with
	// Keep the reverse lookup in step with words, as Replace documents.
	c.ids[with] = id
	return nil
}


================================================
FILE: corpus/corpus_test.go
================================================
package corpus

import (
	"testing"

	"github.com/stretchr/testify/assert"
)

// TestCorpus exercises the basic bookkeeping of a Corpus created by New:
// IDs, reverse lookup, frequencies, totals, word length and probabilities.
func TestCorpus(t *testing.T) {
	assert := assert.New(t)
	dict := New()
	assert.Equal(0, dict.WordFreq("hello")) // frequency of a word not in dict would have to be 0
	assert.Equal(0, dict.IDFreq(3))         // ditto

	id := dict.Add("hello")

	assert.Equal(3, id)
	assert.Equal([]string{"", "-UNKNOWN-", "-ROOT-", "hello"}, dict.words)
	assert.Equal(map[string]int{"": 0, "-UNKNOWN-": 1, "-ROOT-": 2, "hello": 3}, dict.ids)
	assert.Equal(4, dict.Size())

	id2, ok := dict.Id("hello")
	if !ok {
		t.Errorf("Expected ID of \"hello\" to be found")
	}
	assert.Equal(id, id2)

	word, ok := dict.Word(3)
	if !ok {
		t.Errorf("Expected word of ID 3 to be found")
	}
	assert.Equal("hello", word)

	dict.Add(word)
	assert.Equal(2, dict.WordFreq(word))
	assert.Equal(2, dict.IDFreq(3))
	assert.Equal(5, dict.TotalFreq())
	assert.Equal(5, dict.MaxWordLength())

	prob, ok := dict.WordProb(word)
	if !ok {
		t.Errorf("Expected a probability")
	}
	assert.Equal(0.4, prob)
	// t.Logf("%q: %v", word, dict.WordProb(word))
}

func TestCorpus_Merge(t *testing.T) {
	assert := assert.New(t)

	dict := New()
	id := dict.Add("hello")
	dict.frequencies[id] += 4 // freq for "hello" is 5
	dict.totalFreq += 4

	other := New()
	id = other.Add("hello")
	other.frequencies[id] += 2 // freq for "hello" is 3
	other.totalFreq += 2
	id = other.Add("world")
	other.frequencies[id] += 1
	other.totalFreq += 1

	dict.Merge(other)

	assert.Equal(8, dict.WordFreq("hello"))
	assert.Equal(2, dict.WordFreq("world"))
}


================================================
FILE: corpus/functions.go
================================================
package corpus

import (
	"math"
	"strings"
	"unicode/utf8"

	"github.com/chewxy/lingo"
	"github.com/chewxy/lingo/treebank"
	"github.com/pkg/errors"
)

// GenerateCorpus creates a Corpus from the sentences of a training set.
//
// IDs 0, 1 and 2 are reserved for "" (NULL), "-UNKNOWN-" and "-ROOT-", with
// initial frequencies 0, 0 and 1. Every other word gets the next free ID the
// first time it is seen. The maximum word length only considers words found
// in the sentences.
func GenerateCorpus(sentenceTags []treebank.SentenceTag) *Corpus {
	words := []string{"", "-UNKNOWN-", "-ROOT-"}
	frequencies := []int{0, 0, 1}
	knownWords := map[string]int{
		"":          0,
		"-UNKNOWN-": 1,
		"-ROOT-":    2,
	}

	maxWordLength := 0
	for _, st := range sentenceTags {
		for _, lex := range st.Sentence {
			if id, seen := knownWords[lex.Value]; seen {
				frequencies[id]++
				continue
			}

			knownWords[lex.Value] = len(words)
			words = append(words, lex.Value)
			frequencies = append(frequencies, 1)

			if n := utf8.RuneCountInString(lex.Value); n > maxWordLength {
				maxWordLength = n
			}
		}
	}

	totals := 0
	for i := range frequencies {
		totals += frequencies[i]
	}

	return &Corpus{
		words:         words,
		frequencies:   frequencies,
		ids:           knownWords,
		maxid:         int64(len(words)),
		totalFreq:     totals,
		maxWordLength: maxWordLength,
	}
}

// ViterbiSplit is a Viterbi algorithm for splitting words given a corpus.
//
// The input is lower-cased and then segmented into words. For every prefix
// of the input the function keeps the best score found so far and the start
// offset of the last word of that best segmentation, then walks those
// offsets backwards to build the result.
//
// NOTE(review): the code mixes byte offsets (from ranging over the string)
// with rune counts (utf8.RuneCountInString) when indexing lasts and slicing
// s. The two agree for single-byte input only, so multi-byte input looks
// unsafe. Confirm before relying on it.
func ViterbiSplit(input string, c *Corpus) []string {
	s := strings.ToLower(input)
	// probabilities[k] is the best score for the prefix made of the first k
	// runes; lasts[k] is the start offset of the last word in that split.
	probabilities := []float64{1.0}
	lasts := []int{0}

	// runes holds the byte offset at which each rune of s starts, followed by
	// a sentinel that is larger than any offset.
	runes := []int{}
	for i := range s {
		runes = append(runes, i)
	}
	runes = append(runes, len(s)+1)

	for i := range s {
		// Candidate scores and start offsets for a last word ending at i.
		probs := make([]float64, 0)
		ls := make([]int, 0)

		// m := maxInt(0, i-c.maxWordLength)

		for j, r := range runes {
			// Only start offsets up to and including i are candidates.
			if r > i {
				break
			}

			p, ok := c.WordProb(s[r : i+1])
			if !ok {
				// Unknown candidate: use a penalty that grows with its length instead of a probability.
				// http://stackoverflow.com/questions/195010/how-can-i-split-multiple-joined-words#comment48879458_481773
				p = (math.Log(float64(1)/float64(c.totalFreq)) - float64(c.maxWordLength) - float64(1)) * float64(i-r) // note it should be i-r not j-i as per the SO post
			}
			// Score of the best split before r, extended by this candidate.
			prob := probabilities[j] * p

			probs = append(probs, prob)
			ls = append(ls, r)
		}

		// Pick the best-scoring candidate.
		// NOTE(review): -math.SmallestNonzeroFloat64 is the negative number
		// closest to zero, not the lowest float. If every candidate scores
		// below it, maxK keeps the -1<<63 placeholder. Confirm whether
		// -math.MaxFloat64 was intended.
		maxProb := -math.SmallestNonzeroFloat64
		maxK := -1 << 63
		for j, p := range probs {
			if p > maxProb {
				maxProb = p
				maxK = ls[j]
			}
		}
		probabilities = append(probabilities, maxProb)
		lasts = append(lasts, maxK)
	}

	// Walk the recorded start offsets backwards from the end of the input.
	words := make([]string, 0)
	i := utf8.RuneCountInString(s)

	for i > 0 {
		start := lasts[i]
		words = append(words, s[start:i])
		i = start
	}

	// reverse it
	for i, j := 0, len(words)-1; i < j; i, j = i+1, j-1 {
		words[i], words[j] = words[j], words[i]
	}

	return words
}

// CosineSimilarity measures the cosine similarity of two bags of words.
//
// Each distinct word (compared case-insensitively) is one dimension, and the
// value along that dimension is the number of times the word occurs in a or in
// b. The result is the cosine of the angle between the two count vectors: 1 for
// identical word distributions, 0 when no word is shared.
//
// When either input has no words its vector has zero length and the cosine is
// undefined; 0 is returned in that case instead of NaN.
func CosineSimilarity(a, b []string) float64 {
	// tally holds the occurrence count of one distinct word in each input.
	type tally struct{ inA, inB float64 }

	// The capacity covers the worst case of every word being distinct, so
	// tallies never reallocates while entries are handed out.
	index := make(map[string]int, len(a)+len(b))
	tallies := make([]tally, 0, len(a)+len(b))

	// entry returns the tally for word, creating it on first sight.
	entry := func(word string) *tally {
		key := strings.ToLower(word)
		id, ok := index[key]
		if !ok {
			id = len(tallies)
			index[key] = id
			tallies = append(tallies, tally{})
		}
		return &tallies[id]
	}

	for _, word := range a {
		entry(word).inA++
	}
	for _, word := range b {
		entry(word).inB++
	}

	var dotProd, sumSqA, sumSqB float64
	for _, t := range tallies {
		dotProd += t.inA * t.inB
		sumSqA += t.inA * t.inA
		sumSqB += t.inB * t.inB
	}

	denominator := math.Sqrt(sumSqA) * math.Sqrt(sumSqB)
	if denominator == 0 {
		return 0
	}
	return dotProd / denominator
}

// DamerauLevenshtein calculates the Damerau-Levenshtein distance between two
// strings: the number of single-character insertions, deletions, substitutions
// and transpositions of adjacent characters needed to turn s1 into s2. The
// strings are compared rune by rune, not byte by byte.
// See more at https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
func DamerauLevenshtein(s1 string, s2 string) (distance int) {
	// index by code point, not byte
	r1 := []rune(s1)
	r2 := []rune(s2)

	// the maximum possible distance; used below as the cost of a transposition
	// that is not available
	inf := len(r1) + len(r2)

	// if one string is blank, we need insertions
	// for all characters in the other one
	if len(r1) == 0 {
		return len(r2)
	}

	if len(r2) == 0 {
		return len(r1)
	}

	// construct the edit-tracking matrix: matrix[i][j] is the distance between
	// r1[:i+1] and r2[:j+1]
	matrix := make([][]int, len(r1))
	for i := range matrix {
		matrix[i] = make([]int, len(r2))
	}

	// seen characters: maps a rune of r1 to the last row index at which it has
	// been processed so far
	seenRunes := make(map[rune]int)

	// top-left cell: a single substitution if the first runes differ
	if r1[0] != r2[0] {
		matrix[0][0] = 1
	}

	// first column: prefixes of r1 against the first rune of r2
	seenRunes[r1[0]] = 0
	for i := 1; i < len(r1); i++ {
		deleteDist := matrix[i-1][0] + 1
		insertDist := (i+1)*1 + 1
		var matchDist int
		if r1[i] == r2[0] {
			matchDist = i
		} else {
			matchDist = i + 1
		}
		matrix[i][0] = minInt(minInt(deleteDist, insertDist), matchDist)
	}

	// first row: the first rune of r1 against prefixes of r2
	for j := 1; j < len(r2); j++ {
		deleteDist := (j + 1) * 2
		insertDist := matrix[0][j-1] + 1
		var matchDist int
		if r1[0] == r2[j] {
			matchDist = j
		} else {
			matchDist = j + 1
		}

		matrix[0][j] = minInt(minInt(deleteDist, insertDist), matchDist)
	}

	// remaining cells, row by row
	for i := 1; i < len(r1); i++ {
		// largest column index seen so far in this row whose rune equals
		// r1[i], or -1 if there is none yet
		var maxSrcMatchIndex int
		if r1[i] == r2[0] {
			maxSrcMatchIndex = 0
		} else {
			maxSrcMatchIndex = -1
		}

		for j := 1; j < len(r2); j++ {
			// swapIndex is the last earlier row holding r2[j]; jSwap is read
			// before maxSrcMatchIndex can be updated for this column
			swapIndex, ok := seenRunes[r2[j]]
			jSwap := maxSrcMatchIndex
			deleteDist := matrix[i-1][j] + 1
			insertDist := matrix[i][j-1] + 1
			matchDist := matrix[i-1][j-1]
			if r1[i] != r2[j] {
				matchDist += 1
			} else {
				maxSrcMatchIndex = j
			}

			// for transpositions
			var swapDist int
			if ok && jSwap != -1 {
				iSwap := swapIndex
				var preSwapCost int
				if iSwap == 0 && jSwap == 0 {
					preSwapCost = 0
				} else {
					preSwapCost = matrix[maxInt(0, iSwap-1)][maxInt(0, jSwap-1)]
				}
				// cost before the swapped pair, plus one for the swap and one
				// for every rune skipped between the two swapped positions
				swapDist = i + j + preSwapCost - iSwap - jSwap - 1
			} else {
				swapDist = inf
			}
			matrix[i][j] = minInt(minInt(minInt(deleteDist, insertDist), matchDist), swapDist)
		}
		// only record r1[i] once its row is complete, so a rune cannot be
		// transposed with itself
		seenRunes[r1[i]] = i
	}

	return matrix[len(r1)-1][len(r2)-1]
}

// LongestCommonPrefix returns the longest prefix shared by all of strs.
//
// With no arguments it returns ""; with a single argument it returns that
// argument. The returned prefix always ends on a rune boundary of the input, so
// two strings that only share the leading bytes of a multi-byte character do
// not produce a truncated, invalid character.
func LongestCommonPrefix(strs ...string) string {
	switch len(strs) {
	case 0:
		return "" // nothing to compare
	case 1:
		return strs[0]
	}

	// The prefix common to every string is the prefix common to the
	// lexicographically smallest and largest of them.
	lo, hi := strs[0], strs[0]
	for _, s := range strs[1:] {
		switch {
		case s < lo:
			lo = s
		case s > hi:
			hi = s
		}
	}

	for i := 0; i < len(lo) && i < len(hi); i++ {
		if lo[i] != hi[i] {
			// A mismatch on a continuation byte means the rune that contains
			// it differs; back up to where that rune starts.
			for i > 0 && !utf8.RuneStart(lo[i]) {
				i--
			}
			return lo[:i]
		}
	}

	// In the case where lengths are not equal but all bytes
	// are equal, lo is the answer ("foo" < "foobar").
	return lo
}

/* The following two functions help in parsing a string into numbers. It's recommended you write abstractions over the functions*/

// StrsToInts turns a slice of number words into a slice of ints by looking each
// word up in lingo.NumberWords and folding neighbouring values below one
// thousand together. Helper words such as "and" must already have been removed:
// 		"One hundred and five" -> []string{"one", "hundred", "five"}
// A word that is not in lingo.NumberWords yields a nil slice and an error.
//
// This is a very primitive method, and doesn't take into account other words like "a hundred" or "a couple of hundred"
func StrsToInts(strs []string) (retVal []int, err error) {
	for _, word := range strs {
		value, known := lingo.NumberWords[word]
		if !known {
			return nil, errors.Errorf("Unable to parse the words %q as numbers", word)
		}

		last := len(retVal) - 1
		switch {
		case last >= 0 && value == 100 && retVal[last] < 100:
			// "five" "hundred" -> 500
			retVal[last] *= 100
		case last >= 0 && retVal[last] < 1000 && value < 1000:
			// "five hundred" "twenty" -> 520
			retVal[last] += value
		default:
			// first word, or either side is a thousand or more: start a new group
			retVal = append(retVal, value)
		}
	}
	return retVal, nil
}

// CombineInts folds a slice of ints, as produced by StrsToInts, into a single
// integer. It relies on the way English builds large numbers: a group below one
// thousand is followed by its multiplier, e.g.
// 		one hundred and fifty thousand two hundred and two
// is the group (one hundred and fifty) times a thousand, plus the group
// (two hundred and two).
//
// The slice is consumed from the front in (group, multiplier) pairs, each pair
// contributing group*multiplier to the sum. When only one value is left, or the
// leading value is already 1000 or more, the final value of the slice is added
// on its own and removed instead. It panics when a group below 1000 is followed
// by another value below 1000.
func CombineInts(ints []int) int {
	sum := 0
	for n := len(ints); n > 0; n = len(ints) {
		if n > 1 && ints[0] < 1000 {
			multiplier := ints[1]
			if multiplier < 1000 {
				// something went wrong
				panic("HELP!")
			}
			sum += ints[0] * multiplier
			ints = ints[2:]
			continue
		}

		// pop the last value
		sum += ints[n-1]
		ints = ints[:n-1]
	}
	return sum
}


================================================
FILE: corpus/functions_test.go
================================================
package corpus

import (
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

func Test_GenerateCorpus(t *testing.T) {
	sentenceTags := mediumSentence()
	dict := GenerateCorpus(sentenceTags)

	// testing time
	assert := assert.New(t)
	expectedWords := []string{"", "-UNKNOWN-", "-ROOT-", "President", "Bush", "on", "Tuesday", "nominated", "two", "individuals", "to", "replace", "retiring", "jurists", "federal", "courts", "in", "the", "Washington", "area", "."}

	expectedIDs := make(map[string]int)
	for i, w := range expectedWords {
		expectedIDs[w] = i
	}

	assert.Equal(expectedWords, dict.words, "Corpus known words should be the same as the manually annotated expected values")
	assert.Equal(expectedIDs, dict.ids, "IDs should be the same as expected IDs")
	assert.Equal(int64(len(expectedWords)), dict.maxid)
}

func TestViterbiSplit(t *testing.T) {
	assert := assert.New(t)
	dict := GenerateCorpus(mediumSentence())

	s2 := "twoindividuals"
	words := ViterbiSplit(s2, dict)
	assert.Equal([]string{"two", "individuals"}, words)

	s2 = "FederalCourts"
	words = ViterbiSplit(s2, dict)
	assert.Equal([]string{"federal", "courts"}, words)

	s3 := "toreplaceon"
	words = ViterbiSplit(s3, dict)
	assert.Equal([]string{"to", "replace", "on"}, words)
}

// TestCosineSimilarity checks that a sentence is maximally similar to itself
// and that a different sentence never scores higher than that.
func TestCosineSimilarity(t *testing.T) {
	verify := func(base, variant string) {
		x := strings.Split(base, " ")
		y := strings.Split(variant, " ")

		self := CosineSimilarity(x, x)
		cross := CosineSimilarity(x, y)

		if !floatEquals64(self, 1) {
			t.Error("Expected similarity to be 1 when compared with itself")
		}
		if cross > self {
			t.Error("Something went wrong with the cosine similarity algorithm")
		}
	}

	verify("This is a test of cosine similarity", "This is not a test of cosine similarity")
	verify("Parramatta Road", "Parramatta Rd")
}

// TestDL checks that the Damerau-Levenshtein distance of a string to itself is
// zero and that the distance to a different string is never smaller than that.
func TestDL(t *testing.T) {
	verify := func(base, variant string) {
		self := DamerauLevenshtein(base, base)
		cross := DamerauLevenshtein(base, variant)

		if self != 0 {
			t.Errorf("Expected the distance to be 0 when compared against itself. Got %d", self)
		}
		if cross < self {
			t.Error("Expected DL similarity to be greater when compared against itself")
		}
	}

	verify("This is a test of Damerau Levenshtein", "This is not a test of Damerau Levenshtein")
	verify("Parramatta Road", "Paramatta Rd")
}

// TestLCP exercises LongestCommonPrefix with two and three strings, with no
// strings, with a single string, and with one string being a prefix of another.
func TestLCP(t *testing.T) {
	cases := []struct {
		inputs   []string
		expected string
	}{
		{[]string{"Hello World", "Hell yeah!"}, "Hell"},
		{[]string{"Hello World", "Hell yeah!", "hey there"}, ""},
		{nil, ""},
		{[]string{"OneWord"}, "OneWord"},
		{[]string{"foo", "foobar"}, "foo"},
	}

	for _, tc := range cases {
		assert.Equal(t, tc.expected, LongestCommonPrefix(tc.inputs...))
	}
}

// parseNumTests pairs a space-separated number phrase (s), with helper words
// such as "and" already removed, with the integer it should parse to (v).
var parseNumTests = []struct {
	s string
	v int
}{
	{"twenty nine", 29},
	{"one hundred five", 105},
	{"five hundred twenty thousand twenty one", 520021},
}

// TestParseNumber runs every phrase in parseNumTests through StrsToInts and
// CombineInts and compares the result with the expected integer.
func TestParseNumber(t *testing.T) {
	for _, tc := range parseNumTests {
		ints, err := StrsToInts(strings.Split(tc.s, " "))
		if err != nil {
			t.Error(err)
			continue
		}

		if got := CombineInts(ints); got != tc.v {
			t.Errorf("Expected %q to be parsed to %d. Got %d instead", tc.s, tc.v, got)
		}
	}
}


================================================
FILE: corpus/inflection.go
================================================
package corpus

import (
	"regexp"

	"github.com/chewxy/lingo"
)

// conversionPattern is one rewrite rule: a word matching pattern is converted
// by substituting replacement, which may refer to capture groups as ${n}.
type conversionPattern struct {
	pattern     *regexp.Regexp
	replacement string
}

// newConversionPattern compiles from into a rule that rewrites to to. It panics
// if from is not a valid regular expression, so it is only meant for the
// package-level rule tables below.
func newConversionPattern(from, to string) conversionPattern {
	rFrom := regexp.MustCompile(from)
	return conversionPattern{rFrom, to}
}

// singular -> plural
//
// Rules are tried in order and the first one that matches wins, so specific
// rules must come before the catch-all rules at the end.
var plural = []conversionPattern{
	newConversionPattern("(quiz)$", "${1}zes"),
	newConversionPattern("^(ox)$", "${1}en"),
	// anchored so that only "mouse" and "louse" match, not e.g. "blouse"
	newConversionPattern("^([ml])ouse$", "${1}ice"),
	// the suffix alternatives are grouped so that the stem is always required;
	// a bare "ex$" would otherwise turn "complex" into "complices"
	newConversionPattern("(matr|vert|ind)(?:ix|ex)$", "${1}ices"),
	newConversionPattern("(x|ch|ss|sh)$", "${1}es"),
	newConversionPattern("([^aeiouy]|qu)y$", "${1}ies"),
	newConversionPattern("(hive)$", "${1}s"),
	newConversionPattern("(?:([^f])fe|([lr])f)$", "${1}${2}ves"),
	newConversionPattern("sis$", "ses"),
	newConversionPattern("([ti])um$", "${1}a"),
	newConversionPattern("(buffal|tomat|potat)o$", "${1}oes"),
	newConversionPattern("(bu)s$", "${1}ses"),
	newConversionPattern("(alias|status|sex)$", "${1}es"),
	newConversionPattern("(octop|vir)us$", "${1}i"),
	newConversionPattern("(ax|test)is$", "${1}es"),
	// a word already ending in "s" is left as it is
	newConversionPattern("s$", "s"),
	newConversionPattern("$", "s"),
}

// plural -> singular
//
// Rules are tried in order and the first one that matches wins.
var singular = []conversionPattern{
	newConversionPattern("(quiz)zes$", "${1}"),
	newConversionPattern("(matr)ices$", "${1}ix"),
	newConversionPattern("(vert|ind)ices$", "${1}ex"),
	newConversionPattern("^(ox)en$", "${1}"),
	newConversionPattern("(alias|status)es$", "${1}"),
	newConversionPattern("(octop|vir)i$", "${1}us"),
	newConversionPattern("(cris|ax|test)es$", "${1}is"),
	newConversionPattern("(shoe)s$", "${1}"),
	newConversionPattern("(o)es$", "${1}"),
	newConversionPattern("(bus)es$", "${1}"),
	// anchored so that only "mice" and "lice" match, not e.g. "police" or "slice"
	newConversionPattern("^([ml])ice$", "${1}ouse"),
	newConversionPattern("(x|ch|ss|sh)es$", "${1}"),
	newConversionPattern("(m)ovies$", "${1}ovie"),
	newConversionPattern("(s)eries$", "${1}eries"),
	newConversionPattern("([^aeiouy]|qu)ies$", "${1}y"),
	newConversionPattern("([lr])ves$", "${1}f"),
	newConversionPattern("(tive)s$", "${1}"),
	newConversionPattern("(hive)s$", "${1}"),
	newConversionPattern("([^f])ves$", "${1}fe"),
	newConversionPattern("(^analy)ses$", "${1}sis"),
	// ${1} already holds the whole stem; the inner groups only capture its
	// first letter and must not be appended again
	newConversionPattern("((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$", "${1}sis"),
	newConversionPattern("([ti])a$", "${1}um"),
	newConversionPattern("(n)ews$", "${1}ews"),
	newConversionPattern("s$", ""),
}

// weird pluralizations that don't match the rules above
//
// The replacement is the complete plural form, so each pattern is anchored to
// match the whole word only ("man" but not "human" or "manager").
var irregular = []conversionPattern{
	newConversionPattern("^person$", "people"),
	newConversionPattern("^man$", "men"),
	newConversionPattern("^child$", "children"),
	newConversionPattern("^sex$", "sexes"),
	newConversionPattern("^move$", "moves"),
	newConversionPattern("^sleeve$", "sleeves"),
	newConversionPattern("^datum$", "data"),
	newConversionPattern("^box$", "boxes"),
	newConversionPattern("^knife$", "knives"),
}

// unconvertable lists the words that Pluralize and Singularize return
// unchanged.
var unconvertable = []string{
	"equipment",
	"information",
	"rice",
	"money",
	"species",
	"series",
	"fish",
	"sheep",
}

// Pluralize returns the plural of word according to the rules known to this
// package. Words listed in unconvertable are returned as they are. A word
// matched by an irregular rule is replaced outright by that rule's plural;
// otherwise the first matching rule in plural rewrites the word. A word no rule
// matches is returned unchanged.
func Pluralize(word string) string {
	if lingo.InStringSlice(word, unconvertable) {
		return word
	}

	for i := range irregular {
		if irregular[i].pattern.MatchString(word) {
			return irregular[i].replacement
		}
	}

	for i := range plural {
		rule := plural[i]
		if !rule.pattern.MatchString(word) {
			continue
		}
		// log.Printf("\t%q Matches %q", word, rule.pattern.String())
		return rule.pattern.ReplaceAllString(word, rule.replacement)
	}
	return word
}

// Singularize returns the singular of word according to the rules known to
// this package. Words listed in unconvertable are returned as they are;
// otherwise the first matching rule in singular rewrites the word. A word no
// rule matches is returned unchanged.
func Singularize(word string) string {
	if lingo.InStringSlice(word, unconvertable) {
		return word
	}

	for i := range singular {
		rule := singular[i]
		if !rule.pattern.MatchString(word) {
			continue
		}
		return rule.pattern.ReplaceAllString(word, rule.replacement)
	}
	return word
}


================================================
FILE: corpus/inflection_test.go
================================================
package corpus

import "testing"

// pluralizeTest pairs a singular word with the plural Pluralize is expected to
// return for it.
var pluralizeTest = []struct {
	word, correct string
}{
	{"friend", "friends"},
	{"tomato", "tomatoes"},
	{"knife", "knives"},
	{"dwarf", "dwarves"},
	{"box", "boxes"},
	{"ox", "oxen"},
	{"man", "men"},
	{"equipment", "equipment"},
}

// singularizeTest pairs a plural word with the singular Singularize is
// expected to return for it.
var singularizeTest = []struct {
	word, correct string
}{
	{"condolences", "condolence"},
	{"fish", "fish"},
	{"shoes", "shoe"},
	{"viri", "virus"},
	{"elves", "elf"},
}

// TestPluralize checks Pluralize against every entry of pluralizeTest.
func TestPluralize(t *testing.T) {
	for i := range pluralizeTest {
		tc := pluralizeTest[i]
		if got := Pluralize(tc.word); got != tc.correct {
			t.Errorf("Pluralizing %q failed. Want %q. Got %q instead", tc.word, tc.correct, got)
		}
	}
}

// TestSingularize checks Singularize against every entry of singularizeTest.
func TestSingularize(t *testing.T) {
	for i := range singularizeTest {
		tc := singularizeTest[i]
		if got := Singularize(tc.word); got != tc.correct {
			t.Errorf("Singularizing %q failed. Want %q. Got %q instead", tc.word, tc.correct, got)
		}
	}
}


================================================
FILE: corpus/io.go
================================================
package corpus

import (
	"bufio"
	"bytes"
	"encoding/gob"
	"io"
	"strconv"
	"strings"
)

// sortutil orders words by their IDs. words and ids are parallel slices; freqs
// is an optional third parallel slice that is kept in step when it is not empty.
type sortutil struct {
	words []string
	ids   []int
	freqs []int
}

// Len returns the number of words.
func (s *sortutil) Len() int {
	return len(s.words)
}

// Less reports whether the ID at position i is smaller than the ID at j.
func (s *sortutil) Less(i, j int) bool {
	return s.ids[i] < s.ids[j]
}

// Swap exchanges positions i and j in every populated slice.
func (s *sortutil) Swap(i, j int) {
	s.words[i], s.words[j] = s.words[j], s.words[i]
	s.ids[i], s.ids[j] = s.ids[j], s.ids[i]
	if len(s.freqs) == 0 {
		return
	}
	s.freqs[i], s.freqs[j] = s.freqs[j], s.freqs[i]
}

// ToDictWithFreq returns a simple marshalable view of c: a map keyed by word
// whose values hold the word's ID and its frequency. Conceptually it's a JSON
// object with the words as the keys.
func ToDictWithFreq(c *Corpus) map[string]struct{ ID, Freq int } {
	dict := make(map[string]struct{ ID, Freq int })
	for id := range c.words {
		dict[c.words[id]] = struct{ ID, Freq int }{ID: id, Freq: c.frequencies[id]}
	}
	return dict
}

// ToDict returns a marshalable word-to-ID map. The map is a copy, so changing
// it does not affect c.
func ToDict(c *Corpus) map[string]int {
	dict := make(map[string]int)
	for word := range c.ids {
		dict[word] = c.ids[word]
	}
	return dict
}

// GobEncode implements GobEncoder for *Corpus. The fields are written as
// consecutive gob values in a fixed order - words, ids, frequencies, maxid,
// totalFreq, maxWordLength - which GobDecode reads back in the same order.
func (c *Corpus) GobEncode() ([]byte, error) {
	var buf bytes.Buffer
	enc := gob.NewEncoder(&buf)

	fields := []interface{}{
		c.words,
		c.ids,
		c.frequencies,
		c.maxid,
		c.totalFreq,
		c.maxWordLength,
	}
	for _, field := range fields {
		if err := enc.Encode(field); err != nil {
			return nil, err
		}
	}

	return buf.Bytes(), nil
}

// GobDecode implements GobDecoder for *Corpus. It reads the fields back in the
// order GobEncode writes them: words, ids, frequencies, maxid, totalFreq,
// maxWordLength. Decoding stops at the first error, which is returned.
func (c *Corpus) GobDecode(buf []byte) error {
	dec := gob.NewDecoder(bytes.NewBuffer(buf))

	targets := []interface{}{
		&c.words,
		&c.ids,
		&c.frequencies,
		&c.maxid,
		&c.totalFreq,
		&c.maxWordLength,
	}
	for _, target := range targets {
		if err := dec.Decode(target); err != nil {
			return err
		}
	}

	return nil
}

// LoadOneGram loads a 1_gram.txt file, which is a tab separated file which lists the frequency counts of words. Example:
// 		the	23135851162
// 		of	13151942776
// 		and	12997637966
// 		to	12136980858
// 		a	9081174698
// 		in	8469404971
// 		for	5933321709
//
// Every word is added to the corpus and its frequency is set to the listed
// count. Blank lines are skipped. A line whose second column is missing or is
// not an integer stops loading and returns the parse error; words loaded before
// that line stay in the corpus. An error reported by the reader is returned as
// well.
func (c *Corpus) LoadOneGram(r io.Reader) error {
	scanner := bufio.NewScanner(r)
	for scanner.Scan() {
		line := scanner.Text()
		if strings.TrimSpace(line) == "" {
			continue
		}

		fields := strings.Split(line, "\t")

		// A line without a tab has no count column. Leaving the count empty
		// makes the integer parse below fail with a proper error instead of
		// indexing past the end of fields.
		var rawCount string
		if len(fields) > 1 {
			rawCount = fields[1]
		}
		count, err := strconv.Atoi(rawCount)
		if err != nil {
			return err
		}

		word := fields[0] // TODO: normalize
		id := c.Add(word)

		// Replace whatever frequency is recorded for the word (including the
		// occurrence Add has presumably just counted - confirm against Add)
		// with the listed count, and move the total by the same amount so it
		// stays the sum of all frequencies even when a word is listed twice.
		c.totalFreq += count - c.frequencies[id]
		c.frequencies[id] = count

		wc := len([]rune(word))
		if wc > c.maxWordLength {
			c.maxWordLength = wc
		}
	}
	return scanner.Err()
}


================================================
FILE: corpus/io_test.go
================================================
package corpus

import (
	"bytes"
	"encoding/gob"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

// TestCorpusGob round-trips a corpus through gob and checks that the words are
// still present with the same IDs afterwards.
func TestCorpusGob(t *testing.T) {
	original := New()
	original.Add("Hello")
	original.Add("World")

	helloID, _ := original.Id("Hello")
	worldID, _ := original.Id("World")

	var wire bytes.Buffer
	if err := gob.NewEncoder(&wire).Encode(original); err != nil {
		t.Fatal(err)
	}

	decoded := New()
	if err := gob.NewDecoder(&wire).Decode(decoded); err != nil {
		t.Fatal(err)
	}

	if id, ok := decoded.Id("Hello"); !ok || id != helloID {
		t.Errorf("\"Hello\" not found after decoding.")
	}

	if id, ok := decoded.Id("World"); !ok || id != worldID {
		t.Errorf("\"World\" not found after decoding.")
	}
}

func TestCorpusToDict(t *testing.T) {
	assert := assert.New(t)
	c, _ := Construct(WithWords([]string{"World", "Hello", "World"}))

	d := ToDict(c)
	c2, err := Construct(FromDict(d))
	if err != nil {
		t.Fatal(err)
	}
	assert.Equal(c.words, c2.words, "Expected words to be the same")
	assert.Equal(c.ids, c2.ids, "Expected IDs to be the same")
	assert.NotEqual(c.frequencies, c2.frequencies, "Expected frequencies to not be the same")
	assert.Equal(c.maxid, c2.maxid, "Expected maxID to be the same")
	assert.NotEqual(c.totalFreq, c2.totalFreq, "Expected totalFreq to be different.")
	assert.Equal(c.maxWordLength, c2.maxWordLength, "Expected maxWordLength to be the same")
}

func TestCorpusToDictWithFreq(t *testing.T) {
	assert := assert.New(t)
	c, _ := Construct(WithWords([]string{"World", "Hello", "World"}))

	d := ToDictWithFreq(c)
	c2, err := Construct(FromDictWithFreq(d))
	if err != nil {
		t.Fatal(err)
	}

	assert.Equal(c, c2)
}

func TestLoadOneGram(t *testing.T) {
	assert := assert.New(t)
	r := strings.NewReader(sample1Gram)

	c := New()
	err := c.LoadOneGram(r)
	assert.Nil(err)
	assert.Equal(10, c.Size())

	id, ok := c.Id("for")
	if !ok {
		t.Errorf("Expected \"for\" to be in corpus after loading one gram file")
	}
	assert.Equal(int(c.maxid-1), id)

}


================================================
FILE: corpus/lda.go
================================================
package corpus

import (
	"gorgonia.org/tensor"
)

// LDAModel holds the parameters and bookkeeping of a Latent Dirichlet
// Allocation model. TODO: only the initialisation of Alpha and Eta (see init)
// is implemented in this file.
// https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation
type LDAModel struct {
	// params; init fills both with 1/Topics
	Alpha tensor.Tensor // is a Row
	Eta   tensor.Tensor // is a Col
	//Kappa gorgonia.Scalar // Decay
	//Tau0  gorgonia.Scalar // offset

	// parameters needed for working
	Topics      int
	ChunkSize   int
	Terms       int
	UpdateEvery int
	EvalEvery   int

	// consts
	Iterations     int
	GammaThreshold float64

	MinimumProb float64

	// track current progress
	Updates int

	// type: the element type of Alpha and Eta
	Dtype tensor.Dtype
}

// init allocates Alpha and Eta as tensors of l.Topics elements of type l.Dtype.
// For Float64 and Float32 models every element is set to 1/Topics; for any
// other Dtype the tensors are left as allocated.
func (l *LDAModel) init() {
	eta := tensor.New(tensor.Of(l.Dtype), tensor.WithShape(l.Topics))
	alpha := tensor.New(tensor.Of(l.Dtype), tensor.WithShape(l.Topics))

	// fill stays nil for element types that have no uniform prior defined here
	var fill interface{}
	switch l.Dtype {
	case tensor.Float64:
		fill = 1.0 / float64(l.Topics)
	case tensor.Float32:
		fill = float32(1) / float32(l.Topics)
	}

	if fill != nil {
		eta.Memset(fill)
		alpha.Memset(fill)
	}

	l.Alpha, l.Eta = alpha, eta
}


================================================
FILE: corpus/test_test.go
================================================
package corpus

import (
	"strings"

	"github.com/chewxy/lingo/treebank"
)

// sample1Gram is a small word/frequency listing, one tab-separated pair per
// line, in the format LoadOneGram reads.
const sample1Gram = `the	23135851162
of	13151942776
and	12997637966
to	12136980858
a	9081174698
in	8469404971
for	5933321709`

// mediumSentence returns the result of reading a single CoNLL-U formatted
// sentence of 19 tokens ("President Bush on Tuesday nominated two individuals
// ...") with treebank.ReadConllu. It is the shared fixture of the tests in this
// package.
func mediumSentence() []treebank.SentenceTag {
	conllu := `1	President	President	PROPN	NNP	Number=Sing	2	compound	_	_
2	Bush	Bush	PROPN	NNP	Number=Sing	5	nsubj	_	_
3	on	on	ADP	IN	_	4	case	_	_
4	Tuesday	Tuesday	PROPN	NNP	Number=Sing	5	nmod	_	_
5	nominated	nominate	VERB	VBD	Mood=Ind|Tense=Past|VerbForm=Fin	0	root	_	_
6	two	two	NUM	CD	NumType=Card	7	nummod	_	_
7	individuals	individual	NOUN	NNS	Number=Plur	5	dobj	_	_
8	to	to	PART	TO	_	9	mark	_	_
9	replace	replace	VERB	VB	VerbForm=Inf	5	advcl	_	_
10	retiring	retire	VERB	VBG	VerbForm=Ger	11	amod	_	_
11	jurists	jurist	NOUN	NNS	Number=Plur	9	dobj	_	_
12	on	on	ADP	IN	_	14	case	_	_
13	federal	federal	ADJ	JJ	Degree=Pos	14	amod	_	_
14	courts	court	NOUN	NNS	Number=Plur	11	nmod	_	_
15	in	in	ADP	IN	_	18	case	_	_
16	the	the	DET	DT	Definite=Def|PronType=Art	18	det	_	_
17	Washington	Washington	PROPN	NNP	Number=Sing	18	compound	_	_
18	area	area	NOUN	NN	Number=Sing	14	nmod	_	_
19	.	.	PUNCT	.	_	5	punct	_	_

`

	readr := strings.NewReader(conllu)
	return treebank.ReadConllu(readr)
}

// EPSILON64 is the tolerance floatEquals64 uses when comparing float64 values.
const EPSILON64 float64 = 1e-10

// floatEquals64 reports whether a and b differ by less than EPSILON64 in either
// direction.
func floatEquals64(a, b float64) bool {
	return (a-b) < EPSILON64 && (b-a) < EPSILON64
}


================================================
FILE: corpus/utils.go
================================================
package corpus

import (
	"errors"
	"math"
)

// minInt returns the smaller of a and b.
func minInt(a, b int) int {
	if b <= a {
		return b
	}
	return a
}

// maxInt returns the larger of a and b.
func maxInt(a, b int) int {
	if b >= a {
		return b
	}
	return a
}

// dot returns the dot product of a and b. It returns an error when the two
// slices differ in length.
func dot(a, b []float64) (float64, error) {
	if len(a) != len(b) {
		return 0, errors.New("Differing lengths!")
	}

	sum := 0.0
	for i := range a {
		sum += a[i] * b[i]
	}
	return sum, nil
}

// mag returns the Euclidean length of a: the square root of the dot product of
// a with itself. An error from dot is passed on together with dot's value.
func mag(a []float64) (float64, error) {
	sumSquares, err := dot(a, a)
	if err == nil {
		return math.Sqrt(sumSquares), nil
	}
	return sumSquares, err
}


================================================
FILE: dep/README.md
================================================
# Dependency Parser #

Package `dep` provides data structures and algorithms for a dependency parser as described by [Chen and Manning 2014](http://cs.stanford.edu/people/danqi/papers/emnlp2014.pdf) [PDF]. It achieves accuracy scores similar to those of the cited paper.

# Installing #

`go get -u github.com/chewxy/lingo/dep`


# How It Works #

## Transition Based Parsing ##

The core of the parser is a transition based parser, as popularized by [Nivre 2003](https://stp.lingfil.uu.se/~nivre/docs/iwpt03.pdf) [PDF]. It's essentially a [shift-reduce parser](https://en.wikipedia.org/wiki/Shift-reduce_parser) with more states. Dan Jurafsky has a very [complete overview of transition-based parsing](https://web.stanford.edu/~jurafsky/slp3/14.pdf) [PDF], which should be consulted should more questions arise.

### Transitions ###

At the core of a transition based parser are two data structures: a stack and a queue. The queue, or buffer holds a list of words waiting to be parsed. Parsing is then simply a matter of manipulating the state of the stack and queue. Specifically there are three possible actions in an arc-standard parser:

* `Shift`: Shift simply shifts one word from the buffer on to the top of the stack
* `Left`: Left means the top of the stack is the head of the word underneath it. After the transition is applied (the link between the nodes attached), the word underneath the top of the stack is removed.
* `Right`: Right means that the top of the stack is the child of the word underneath it. After the transition is applied, the top of the stack is popped.

A word on the terms "head", and "child". Consider the sentence "I am human":

!["I am human" example](https://github.com/chewxy/lingo/blob/master/dep/documentation/iamhuman.dot.png?raw=true)

We say "human" is the head of the words "I" and "am". Therefore, "I" and "am" are considered to be children of "human".

### Example ###

Let's look at a simple example to make the ideas concrete: "The cat sat on the mat". Here are the states:

| Step | Stack                         | Buffer                                    | Transition |
|------|-------------------------------|-------------------------------------------|------------|
|0 | [ROOT]                            | ["The", "cat", "sat", "on", "the", "mat"] | Shift      |
|1 | [ROOT, "The"]                     | ["cat", "sat", "on", "the", "mat"]        | Shift      |
|2 | [ROOT, "The", "cat"]              | ["sat", "on", "the", "mat"]               | Left       | 
|3 | [ROOT, "cat"]                     | ["sat", "on", "the", "mat"]               | Shift      |
|4 | [ROOT, "cat", "sat"]              | ["on", "the", "mat"]                      | Left       |
|5 | [ROOT, "sat"]                     | ["on", "the", "mat"]                      | Shift      |
|6 | [ROOT, "sat", "on"]               | ["the", "mat"]                            | Shift      |
|7 | [ROOT, "sat", "on", "the"]        | ["mat"]                                   | Shift      |
|8 | [ROOT, "sat", "on", "the", "mat"] | []                                        | Left       |
|9 | [ROOT, "sat", "on", "mat"]        | []                                        | Left       |
|10| [ROOT, "sat", "mat"]              | []                                        | Right      |
|11| [ROOT, "sat"]                     | []                                        | Right      |

The above transitions produce this parse tree:

!["the cat sat on the mat"](https://github.com/chewxy/lingo/blob/master/dep/documentation/thecatsatonthemat.dot.png?raw=true)

The real question then is of course - how does the system know which is the correct transition to emit, given the state?

The answer is machine learning.

## Machine Learning ##

What exactly are we learning? Or more carefully put, what are the inputs and outputs of the machine learning algorithm? The table in the example above provides a template for the inputs and output. The output is easy - the transition is what we want to learn. 

As for the input, it's a little bit more complex. The input consists of the stack and the buffer. It'd be impractical and slow to include everything in the stack and buffer (dynamic neural networks are somewhat slower than static ones). So Chen and Manning came up with an ingenious idea - 

* Use the top 3 words of the stack
* Use the top 3 words of the buffer
* Use the first and second leftmost/rightmost children of the first two words of the stack

Instead of directly using the words, POS tags and dependency relations as features, the rather ingenious idea was to represent these features with vectors drawn from an embedding matrix. So instead of building sparse features, the vectors are concatenated to form a fixed-size input vector. This makes training the network much more expedient.
You'll find this in [features.go](https://github.com/chewxy/lingo/blob/master/dep/features.go)

Given each state above, it'd be fairly trivial to extract an input vector based on the 18 "features" listed and feed forwards to a neural network. The result is a fast parser.

### Neural Network ###

The machine learning algorithm behind this parser is a simple 3-layered network. An input layer is constructed from the embedding matrices, and is forwarded to the first layer, which is activated by a cube activation function. This then passes forwards to a dropout layer before the last layer, which is a softmax layer.

[image of NN] 

## Hairy Bits ##

The hairy bits of this is the oracle. Specifically, the question: given a training sentence, how do we generate correct examples such as the table above? 

TODO: finish writing this section


# How To Use #

This package provides three main data structures for use:

* `Parser`
* `Model`
* `Trainer`

`Trainer` takes a `[]treebank.SentenceTag` and produces a `Model`. `Parser` requires a `Model` to run, and is basically an exported wrapper over `configuration` that handles a pipeline.

## Basic NLP Pipeline ##

```go
func main() {
	inputString := `The cat sat on the mat`
	lx := lexer.New("dummy", strings.NewReader(inputString)) // lexer - required to break a sentence up into words. 
	pt := pos.New(pos.WithModel(posModel))                   // POS Tagger - required to tag the words with a part of speech tag.
	dp := dep.New(depModel)                                  // Creates a new parser

	// set up a pipeline
	pt.Input = lx.Output
	dp.Input = pt.Output

	// run all
	go lx.Run()
	go pt.Run()
	go dp.Run()

	// wait to receive:
	for {
		select {
		case d := <- dp.Output:
			// do something
		case err:= <-dp.Error:
			// handle error
		}
	}

}
```

## Training A Model ##

To train a model you'd use the `Trainer`. The trainer accepts a `[]treebank.SentenceTag`. As long as you can parse your training file into those (package `treebank` accepts CONLLU formatted files as well as the PennTreebank formatted files), you'd be fine.

An example trainer is in the cmd directory of `lingo`

# FAQ #

**Why not an LSTM or RNN to encode the state of the stack and buffer?**

The answer is simplicity and speed. I have attempted variants of the parser with different neural networks - they don't work as fast as this. I am aware of Parsey-McParseface and the slightly improved accuracy compared to this model, but the speed has not been as great as I expected. This package emphasises parsing speed over accuracy - for most well written English sentences, this package performs well.

**Why are there no models?**

I'm afraid you're gonna have to train your own models. Training takes days on the Universal Dependency dataset and I haven't had the time to train on those. All my models are specific to the use of the company, and hence cannot be released.

**What caveats are there?**

Chen and Manning described using pre-computed activations for the top 10000 or so words. I did not implement that, but it would be trivial to revisit and implement it. Feel free to send a pull request.

**How can this be sped up?**

Use multiple, smaller trainers, each training on a separate batch. You can hence train them concurrently (pass the costs in a channel and collect at the end). At the end, sum the gradients before applying adagrad. The trade off is that a LOT more memory will be used. It's also the reason why it wasn't included as the default. It's quite trivial to write though. Send a pull request if you have managed to reduce memory usage.


# Contributing #

See package lingo's CONTRIBUTING.md for more information. There is currently a list of issues in GitHub issues. Those are good places to start.

# Licence #

This package is MIT licenced.

================================================
FILE: dep/arcStandard.go
================================================
package dep

import "github.com/chewxy/lingo"

// var SingleRoot bool = true // make this part of a build process

// canApply reports whether transition t is permitted in the current
// configuration.
//
// For Left and Right, the word that remains on the stack once the arc is applied
// is inspected first (top of the stack for Left, second from the top for Right).
// The arc is rejected when that slot is empty, or when that word is the root
// (index 0) and the arc is not labelled lingo.Root. After that only sizes matter:
//   - Left needs more than two items on the stack;
//   - Right needs more than two items on the stack, or exactly two items and an
//     exhausted buffer;
//   - any other move (Shift) needs a non-empty buffer.
func (c *configuration) canApply(t transition) bool {
	switch t.Move {
	case Left, Right:
		governor := c.stackValue(1)
		if t.Move == Left {
			governor = c.stackValue(0)
		}

		if governor < 0 {
			return false
		}
		if governor == 0 && t.DependencyType != lingo.Root {
			return false
		}
	default:
		return c.bufferSize() > 0
	}

	onStack := c.stackSize()
	if t.Move == Left {
		return onStack > 2
	}

	// Right arc. A build that does not enforce a single root would only need
	// onStack >= 2 here.
	return onStack > 2 || (onStack == 2 && c.bufferSize() == 0)
}

// apply executes transition t on the configuration without checking that it is
// legal (see canApply).
//
// Left adds an arc from the top of the stack to the second item and removes the
// second item; Right adds an arc from the second item to the top and removes the
// top; anything else shifts the next buffer entry onto the stack.
func (c *configuration) apply(t transition) {
	logf("Applying %v", t)
	second, top := int(c.stackValue(1)), int(c.stackValue(0))

	switch t.Move {
	case Left:
		c.AddArc(top, second, t.DependencyType)
		c.removeSecondTopStack()
	case Right:
		c.AddArc(second, top, t.DependencyType)
		c.removeTopStack()
	default:
		c.shift()
	}
}

// oracle returns the transition that the gold parse prescribes for the current
// state.
//
// With s1 the second item on the stack and s0 the top:
//   - if s1 is a real word (> 0) whose gold head is s0, the result is a Left arc
//     carrying s1's gold label;
//   - else if s1 exists (>= 0), s0's gold head is s1, and s0 has already
//     collected all of its gold children, the result is a Right arc carrying
//     s0's gold label;
//   - otherwise the zero-value transition is returned, which is Shift.
func (c *configuration) oracle(goldParse *lingo.Dependency) (t transition) {
	second := int(c.stackValue(1))
	top := int(c.stackValue(0))

	switch {
	case second > 0 && goldParse.Head(second) == top:
		t.Move = Left
		t.DependencyType = goldParse.Label(second)
	case second >= 0 && goldParse.Head(top) == second && !c.hasOtherChildren(top, goldParse):
		t.Move = Right
		t.DependencyType = goldParse.Label(top)
	}
	return t
}


================================================
FILE: dep/arcStandard_test.go
================================================
package dep

import (
	"testing"

	"github.com/chewxy/lingo"
	"github.com/stretchr/testify/assert"
)

// TestCanApply checks which transitions canApply permits in two states:
// the initial state (only the root on the stack), where only Shift is legal,
// and the state after two shifts (three items on the stack), where every
// probed transition is legal.
func TestCanApply(t *testing.T) {
	dep := simpleSentence()[0].Dependency(dummyFix{})

	buffer := make([]head, 0)
	for i := 1; i < dep.WordCount(); i++ {
		buffer = append(buffer, head(i))
	}

	stack := []head{0}

	c := &configuration{
		Dependency: dep,
		stack:      stack,
		buffer:     buffer,
	}

	assert := assert.New(t)

	logf("Start config: \n%v", c)

	rootLeft := c.canApply(transition{Left, lingo.Root})
	rootRight := c.canApply(transition{Right, lingo.Root})
	NSubjLeft := c.canApply(transition{Left, lingo.NSubj})
	NSubjRight := c.canApply(transition{Right, lingo.NSubj})
	ShiftDep := c.canApply(transition{Shift, lingo.NoDepType})

	assert.Equal(false, rootLeft, "rootLeft should be false")
	assert.Equal(false, rootRight, "rootRight should be false")
	assert.Equal(false, NSubjLeft, "NSubjLeft should be false")
	assert.Equal(false, NSubjRight, "NSubjRight should be false")
	assert.Equal(true, ShiftDep, "ShiftDep should be true")

	// Arguments are passed in the same order as the labels in the format string.
	logf("rootRight: %v, rootLeft: %v", rootRight, rootLeft)
	logf("NSubjRight: %v, NSubjLeft: %v", NSubjRight, NSubjLeft)
	logf("ShiftDep: %v", ShiftDep)

	// Move two words onto the stack so that arcs become possible.
	c.shift()
	c.shift()
	logf("%v", c)

	rootLeft = c.canApply(transition{Left, lingo.Root})
	rootRight = c.canApply(transition{Right, lingo.Root})
	NSubjLeft = c.canApply(transition{Left, lingo.NSubj})
	NSubjRight = c.canApply(transition{Right, lingo.NSubj})
	ShiftDep = c.canApply(transition{Shift, lingo.NoDepType})

	assert.Equal(true, rootLeft, "rootLeft should be true")
	assert.Equal(true, rootRight, "rootRight should be true")
	assert.Equal(true, NSubjLeft, "NSubjLeft should be true")
	assert.Equal(true, NSubjRight, "NSubjRight should be true")
	assert.Equal(true, ShiftDep, "ShiftDep should be true")

	logf("rootRight: %v, rootLeft: %v", rootRight, rootLeft)
	logf("NSubjRight: %v, NSubjLeft: %v", NSubjRight, NSubjLeft)
	logf("ShiftDep: %v", ShiftDep)
}

// TestOracle replays the oracle's transitions for the simple sentence and checks
// that the heads built by the configuration equal the gold heads. A transition
// that canApply rejects fails the test, except for the Right/Root arc, which is
// tolerated.
func TestOracle(t *testing.T) {
	gold := simpleSentence()[0].AnnotatedSentence(nil)
	conf := newConfiguration(gold, true)
	goldDep := gold.Dependency()

	rootArc := transition{Right, lingo.Root}
	for step := 0; step < 100 && !conf.isTerminal(); step++ {
		next := conf.oracle(goldDep)

		if next != rootArc && !conf.canApply(next) {
			t.Errorf("Cannot apply %v", next)
			break
		}

		conf.apply(next)
	}

	assert.Equal(t, goldDep.Heads(), conf.Heads())
}


================================================
FILE: dep/configuration.go
================================================
package dep

import (
	"fmt"

	"github.com/chewxy/lingo"
)

// describes the current state of the parser

// head is a word index within the sentence being parsed. The stack and buffer
// of a configuration hold values of this type; index 0 is the root.
type head int

const (
	// DOES_NOT_EXIST (-1) is the sentinel returned when a requested stack slot,
	// buffer slot or child is absent.
	DOES_NOT_EXIST head = iota - 1
)

// configuration is the meat of the shift-reduce parsing. It holds the state for the shift reduction:
// the dependency being built, the stack of word indices, and the buffer of word indices still to be shifted.
type configuration struct {
	*lingo.Dependency        // the dependency structure that arcs are added to
	stack             []head // word indices currently on the stack; the last element is the top
	buffer            []head // word indices of the sentence; entries before bp have already been shifted

	bp int // buffer pointer - starts at 0, increments
}

// newConfiguration builds the initial parser state for sentence: the stack holds
// only the root (index 0) and the buffer holds the indices 1..len(sentence)-1.
//
// When fromGold is true the sentence is cloned first, so the dependency that is
// built does not share annotations with the caller's (gold) sentence.
func newConfiguration(sentence lingo.AnnotatedSentence, fromGold bool) *configuration {
	if fromGold {
		sentence = sentence.Clone()
	}

	d := lingo.NewDependency(lingo.FromAnnotatedSentence(sentence), lingo.AllocTree())
	d.SetID()

	// The first annotation is skipped: per the original author's note it is the
	// root tag that the POS tagger inserts automatically.
	words := sentence[1:]

	var pending []head
	for idx := range words {
		pending = append(pending, head(idx+1))
	}

	return &configuration{
		Dependency: d,
		stack:      []head{0}, // the root starts on the stack
		buffer:     pending,
	}
}

// String renders the stack, the buffer pointer and the not-yet-shifted part of
// the buffer.
func (c *configuration) String() string {
	remaining := c.buffer[c.bp:]
	return fmt.Sprintf("Stack: %v Buffer(%d): %v", c.stack, c.bp, remaining)
}

// GoString is the verbose (%#v) form of String: in addition to the stack and
// remaining buffer it prints the heads and labels assigned so far.
func (c *configuration) GoString() string {
	remaining := c.buffer[c.bp:]
	heads, rels := c.Heads(), c.Labels()
	return fmt.Sprintf("Stack: %v Buffer(%d): %v\nHeads: %v\nRels: %v\n", c.stack, c.bp, remaining, heads, rels)
}

// bufferSize returns the number of buffer entries that have not been shifted
// yet, i.e. those at or after the buffer pointer.
func (c *configuration) bufferSize() int {
	return len(c.buffer) - c.bp
}

// stackSize returns the number of items currently on the stack.
func (c *configuration) stackSize() int {
	return len(c.stack)
}

// head returns the head currently assigned to word i, as a head value.
// No bounds checking is done: an out-of-range i panics.
func (c *configuration) head(i int) head {
	// TODO: maybe some sanity checks?
	return head(c.Heads()[i])
}

// stackValue returns the sentence index of the ith item counted from the top of
// the stack (i == 0 is the top). It returns DOES_NOT_EXIST when i is negative or
// the stack does not hold that many items.
func (c *configuration) stackValue(i int) head {
	if n := c.stackSize(); i >= 0 && i < n {
		return c.stack[n-1-i]
	}
	return DOES_NOT_EXIST
}

// bufferValue returns the sentence index of the ith not-yet-shifted buffer entry
// (i == 0 is the next word to be shifted). It returns DOES_NOT_EXIST when i is
// negative or the remaining buffer does not hold that many entries.
//
// The negative check mirrors stackValue: without it a negative i would read an
// entry that has already been shifted, or index before the start of the slice.
func (c *configuration) bufferValue(i int) head {
	if i < 0 || i >= c.bufferSize() {
		return DOES_NOT_EXIST
	}
	return c.buffer[c.bp+i]
}

/*  stack machinations */

// pop removes and returns the top of the stack. It panics on an empty stack.
// It isn't really used any more: removeStack(), removeTopStack() and
// removeSecondTopStack() have superseded it.
func (c *configuration) pop() head {
	last := len(c.stack) - 1
	top := c.stack[last]
	c.stack = c.stack[:last]
	return top
}

// removeStack deletes the element at slice position i of the stack (counted
// from the bottom), shifting the later elements down in place.
func (c *configuration) removeStack(i int) {
	c.stack = append(c.stack[:i], c.stack[i+1:]...)
}

// removeSecondTopStack removes the item directly below the top of the stack.
// It reports false, leaving the stack untouched, when fewer than two items are
// present.
func (c *configuration) removeSecondTopStack() bool {
	if n := c.stackSize(); n >= 2 {
		c.removeStack(n - 2)
		return true
	}
	return false
}

// removeTopStack removes the top item of the stack. It reports false when the
// stack is empty.
func (c *configuration) removeTopStack() bool {
	if n := c.stackSize(); n >= 1 {
		c.removeStack(n - 1)
		return true
	}
	return false
}

/* Dependency related stuff */

// label returns the dependency type currently assigned to word i. The root
// (i == 0) and absent words (i < 0) yield lingo.NoDepType.
func (c *configuration) label(i head) lingo.DependencyType {
	if i <= 0 {
		return lingo.NoDepType
	}
	return c.Label(int(i))
}

// annotation returns the annotation for word i: the null annotation for an
// absent word (i < 0), the root annotation for i == 0, and otherwise the
// sentence's own annotation at that index.
func (c *configuration) annotation(i head) *lingo.Annotation {
	switch {
	case i < 0:
		return lingo.NullAnnotation()
	case i == 0:
		return lingo.RootAnnotation()
	default:
		return c.Annotation(int(i))
	}
}

// lc returns the cnt-th left child of word k, counting from the leftmost word of
// the sentence: it scans indices 1..k-1 in ascending order and returns the
// cnt-th one whose current head is k. DOES_NOT_EXIST is returned when k is
// outside 0..N() or there are fewer than cnt such children.
func (c *configuration) lc(k, cnt head) head {
	parent := int(k)
	if parent < 0 || parent > c.N() {
		return DOES_NOT_EXIST
	}

	seen := 0
	for child := 1; child < parent; child++ {
		if c.Head(child) != parent {
			continue
		}
		seen++
		if seen == int(cnt) {
			return head(child)
		}
	}
	return DOES_NOT_EXIST
}

// rc returns the cnt-th right child of word k, counting from the rightmost word
// of the sentence: it scans indices N()..k+1 in descending order and returns the
// cnt-th one whose current head is k. DOES_NOT_EXIST is returned when k is
// outside 0..N() or there are fewer than cnt such children.
func (c *configuration) rc(k, cnt head) head {
	parent := int(k)
	if parent < 0 || parent > c.N() {
		return DOES_NOT_EXIST
	}

	seen := 0
	for child := c.N(); child > parent; child-- {
		if c.Head(child) != parent {
			continue
		}
		seen++
		if seen == int(cnt) {
			return head(child)
		}
	}
	return DOES_NOT_EXIST
}

// hasOtherChildren reports whether word i still lacks at least one of its gold
// children, i.e. whether some word is headed by i in goldParse but not (yet) in
// the configuration being built.
func (c *configuration) hasOtherChildren(i int, goldParse *lingo.Dependency) bool {
	total := goldParse.N()
	for w := 1; w <= total; w++ {
		if goldParse.Head(w) != i {
			continue
		}
		if c.Head(w) != i {
			return true
		}
	}
	return false
}

// isTerminal reports whether parsing is finished: exactly one item is left on
// the stack and the buffer is exhausted.
func (c *configuration) isTerminal() bool {
	return c.stackSize() == 1 && c.bufferSize() == 0
}

// shift moves the next buffer entry onto the stack and advances the buffer
// pointer. It reports false, changing nothing, when the buffer is exhausted.
func (c *configuration) shift() bool {
	next := c.bufferValue(0)
	if next == DOES_NOT_EXIST {
		return false
	}

	c.stack = append(c.stack, next)
	c.bp++ // the entry is consumed from the buffer
	return true
}


================================================
FILE: dep/configuration_test.go
================================================
package dep

import (
	"testing"

	"github.com/chewxy/lingo"
	"github.com/stretchr/testify/assert"
)

// TestStackAppendRemove exercises removeTopStack and removeSecondTopStack on a
// fresh configuration, then checks that the dependency built from the medium
// sentence carries the expected heads and labels (with the root prepended).
func TestStackAppendRemove(t *testing.T) {
	sentence := mediumSentence()[0]
	conf := newConfiguration(sentence.AnnotatedSentence(dummyFix{}), true)
	t.Logf("C: %v", conf)
	t.Logf("C: %#v", conf)

	assert := assert.New(t)

	conf.stack = append(conf.stack, 200)
	assert.Equal([]head{0, 200}, conf.stack, "stack is not equal after appending")

	conf.removeTopStack()
	assert.Equal([]head{0}, conf.stack, "stack is not equal after removeTopStack")

	conf.stack = append(conf.stack, 200)
	conf.removeSecondTopStack()
	assert.Equal([]head{200}, conf.stack, "stack is not equal after removeSecondTopStack()")

	// the -1 is the root
	wantHeads := append([]int{-1}, sentence.Heads...)
	wantLabels := append([]lingo.DependencyType{lingo.Root}, sentence.Labels...)

	built := sentence.Dependency(dummyFix{})
	assert.Equal(wantHeads, built.Heads(), "Heads are not equal")
	assert.Equal(wantLabels, built.Labels(), "Labels are not equal %v \n %v", wantLabels, built.Labels())
}

// TestConfiguration_StackValue checks that stackValue indexes from the top of
// the stack and returns DOES_NOT_EXIST for out-of-range and negative depths.
func TestConfiguration_StackValue(t *testing.T) {
	c := &configuration{stack: []head{0, 1, 2, 5, 6}}
	assert := assert.New(t)

	cases := []struct {
		depth int
		want  head
		msg   string
	}{
		{0, head(6), "Zeroth value not the same"},
		{1, head(5), "First value not the same"},
		{4, head(0), "Fourth value not the same"},
		{5, DOES_NOT_EXIST, "Fifth value not the same"},
		{-1, DOES_NOT_EXIST, "NegOne value not the same"},
	}

	for _, tc := range cases {
		assert.Equal(tc.want, c.stackValue(tc.depth), tc.msg)
	}
}


================================================
FILE: dep/debug.go
================================================
// +build debug

package dep

import (
	"bytes"
	"fmt"
	"log"
	"runtime"
	"strings"
	"sync/atomic"

	"github.com/chewxy/lingo"
)

// BUILD_DEBUG and BUILD_DIAG are human-readable names for this build flavour.
const BUILD_DEBUG = "PARSER: DEBUG BUILD"
const BUILD_DIAG = "Diagnostic Build"

// DEBUG gates the logging helpers in this file; it is true in the debug build.
const DEBUG = true

// READMEMSTATS additionally gates logMemStats.
var READMEMSTATS = true

// TABCOUNT is the current logging indentation depth. It is read and written
// with sync/atomic by the logging-context helpers.
var TABCOUNT uint32 = 0

// tabcount atomically loads the current logging indentation depth.
func tabcount() int {
	return int(atomic.LoadUint32(&TABCOUNT))
}

// enterLoggingContext increases the logging indentation by one level and
// installs a matching run of tabs as the standard logger's prefix.
func enterLoggingContext() {
	atomic.AddUint32(&TABCOUNT, 1)
	log.SetPrefix(strings.Repeat("\t", tabcount()))
}

// leaveLoggingContext decreases the logging indentation by one level, never
// going below zero, and updates the standard logger's prefix accordingly.
func leaveLoggingContext() {
	depth := tabcount() - 1
	if depth < 0 {
		depth = 0
	}

	atomic.StoreUint32(&TABCOUNT, uint32(depth))
	log.SetPrefix(strings.Repeat("\t", depth))
}

// logf forwards to log.Printf when DEBUG is set and does nothing otherwise.
func logf(format string, others ...interface{}) {
	if DEBUG {
		log.Printf(format, others...)
	}
}

// logTrainingProgress logs, when DEBUG is set, the accuracy of a training
// iteration (correct/total) and the dictionary load factor (length/possibles).
func logTrainingProgress(iteration, correct, total, length, possibles int) {
	if !DEBUG {
		return
	}

	accuracy := float64(correct) / float64(total)
	loadFactor := float64(length) / float64(possibles)

	log.Printf("Iteration %d. Correct/Total: %d/%d = %.2f", iteration, correct, total, accuracy)
	log.Printf("DictSize: %d/%d, load factor of: %.2f", length, possibles, loadFactor)
}

// logMemStats logs a snapshot of runtime.MemStats (Alloc, TotalAlloc, HeapAlloc
// and HeapSys) in megabytes. It does nothing unless both DEBUG and READMEMSTATS
// are set.
func logMemStats() {
	if !(DEBUG && READMEMSTATS) {
		return
	}

	var stats runtime.MemStats
	runtime.ReadMemStats(&stats)

	// toMB converts a byte count to megabytes.
	toMB := func(b uint64) float64 { return (float64(b) / 1024) / float64(1024) }

	log.Printf("Allocated          : %.2f MB", toMB(stats.Alloc))
	log.Printf("Total Allocated    : %.2f MB", toMB(stats.TotalAlloc))
	log.Printf("Heap Allocted      : %.2f MB", toMB(stats.HeapAlloc))
	log.Printf("Sys Total Allocated: %.2f MB", toMB(stats.HeapSys))
	log.Println("----------")
}

// recoverFrom is meant to be deferred directly (defer recoverFrom(...)). If the
// surrounding function is panicking, it logs the formatted context message and
// then re-panics with the original value; otherwise it does nothing.
func recoverFrom(format string, attrs ...interface{}) {
	r := recover()
	if r == nil {
		return
	}

	log.Printf(format, attrs...)
	panic(r)
}

// SprintFeatures renders a feature vector, one feature per line, for debugging.
// The first 18 entries are printed as words (looked up in the model's corpus
// after subtracting wordFeatsStartAt, with "-NULL-" for an empty word), the next
// 18 as POS tags, and the following 12 as dependency types (after subtracting
// labelFeatsStartAt). features must hold at least 48 entries.
func (d *Parser) SprintFeatures(features []int) string {
	var out bytes.Buffer

	for i := 0; i < 18; i++ {
		raw := features[i]
		word, _ := d.corpus.Word(raw - wordFeatsStartAt)
		if word == "" {
			word = "-NULL-"
		}

		fmt.Fprintf(&out, "%d, %q, %d \n", feature(i), word, raw)
	}

	for i := 18; i < 36; i++ {
		raw := features[i]
		fmt.Fprintf(&out, "%d, %v, %d\n", feature(i), lingo.POSTag(raw), raw)
	}

	for i := 36; i < 48; i++ {
		raw := features[i]
		fmt.Fprintf(&out, "%d, %v, %d\n", feature(i), lingo.DependencyType(raw-labelFeatsStartAt), raw)
	}

	return out.String()
}

// SprintScores renders each score next to the transition at the same index, one
// pair per line. Scores without a corresponding transition are labelled
// "UNKNOWN TRANSITION".
func SprintScores(scores []float64, ts []transition) string {
	var out bytes.Buffer
	for i, score := range scores {
		if i < len(ts) {
			fmt.Fprintf(&out, "%v, %v\n", ts[i], score)
		} else {
			fmt.Fprintf(&out, "UNKNOWN TRANSITION, %v\n", score)
		}
	}
	return out.String()
}

// SprintFloatSlice formats a as "[v0, v1, ...]" using %v for each element.
// An empty or nil slice yields "[]".
func SprintFloatSlice(a []float64) string {
	var out bytes.Buffer
	out.WriteString("[")
	for idx := range a {
		if idx > 0 {
			out.WriteString(", ")
		}
		fmt.Fprintf(&out, "%v", a[idx])
	}
	out.WriteString("]")
	return out.String()
}


================================================
FILE: dep/dependencyParser.go
================================================
package dep

import (
	"fmt"

	"github.com/chewxy/lingo"
	"github.com/chewxy/lingo/corpus"
	"github.com/pkg/errors"
)

// KnownWords is a package-level corpus. Package dep's init populates it with a
// corpus that contains only the empty (null) word.
var KnownWords *corpus.Corpus // package provided global

// Parser is the object that performs the dependency parsing
// It contains a neural network, which is the core of it.
//
// The same object can be used to train the NN
type Parser struct {
	Input  chan lingo.AnnotatedSentence // sentences to parse; read by Run until closed
	Output chan *lingo.Dependency       // parsed dependencies; closed when Run returns
	Error  chan error                   // receives the error that makes Run stop

	*Model // the model (network, corpus, transitions) used for prediction
}

// New creates a new Parser around model m, with unbuffered Output and Error
// channels. Input is left nil: the caller must assign it before calling Run,
// since ranging over a nil channel blocks forever.
func New(m *Model) *Parser {
	return &Parser{
		Output: make(chan *lingo.Dependency),
		Error:  make(chan error),

		Model: m,
	}
}

// Run is used when using the NN to parse sentences. For training, see Train().
//
// It parses every sentence received on Input and sends the result on Output.
// On the first prediction error it sends the error on Error and stops. Output is
// closed when Run returns, whether because Input was closed or because of an
// error.
func (d *Parser) Run() {
	defer close(d.Output)

	for sentence := range d.Input {
		dep, err := d.predict(sentence)
		if err == nil {
			d.Output <- dep
			continue
		}

		d.Error <- err
		return
	}
}

// predict parses sentence greedily with the arc-standard system and returns the
// resulting dependency.
//
// At each step the network's best-scoring transition is used if it can be
// applied. Otherwise the best-scoring transition among those that CAN be applied
// is chosen; Shift is only the fallback when no known transition is applicable.
// Parsing stops when the configuration is terminal or after 100 steps (the
// "tarpit" guard). Common post-hoc fixes are applied before returning.
//
// An error is returned when the network fails to predict or when the score
// tensor has an unsupported element type.
func (d *Parser) predict(sentence lingo.AnnotatedSentence) (*lingo.Dependency, error) {
	c := newConfiguration(sentence, false)

	var err error
	var argmax int
	var count int
	for !c.isTerminal() && count < 100 {
		logf("%v", c)
		if count == 99 {
			logf("TARPIT")
		}

		features := getFeatures(c, d.corpus)

		if argmax, err = d.nn.pred(features); err != nil {
			return nil, err
		}

		t := transitions[argmax] // no this is NOT a mistake
		if !c.canApply(t) {
			t = transition{Shift, lingo.NoDepType} // fallback when nothing is applicable

			// Manual argmax restricted to applicable transitions. The running
			// maximum is seeded from the first applicable transition rather
			// than from zero, so that negative scores are still ranked.
			switch scores := d.nn.scores.Value().Data().(type) {
			case []float32:
				var maxScore float32
				var found bool
				for i, kt := range d.ts {
					if !c.canApply(kt) {
						continue
					}
					if !found || scores[i] > maxScore {
						maxScore = scores[i]
						found = true
						t = kt
					}
				}
			case []float64:
				var maxScore float64
				var found bool
				for i, kt := range d.ts {
					if !c.canApply(kt) {
						continue
					}
					if !found || scores[i] > maxScore {
						maxScore = scores[i]
						found = true
						t = kt
					}
				}
			default:
				return nil, errors.Errorf("Unhandled score type %T", d.nn.scores.Value())
			}
		}
		c.apply(t)

		count++
	}
	fix(c.Dependency)
	return c.Dependency, err
}

// String describes the parser: dictionary size and tag/dependency-type counts
// when a corpus is present, followed by the neural network when it has been
// initialized.
//
// String never panics: a missing model, corpus or (uninitialized) network simply
// leaves the corresponding section out.
func (d *Parser) String() string {
	var nns, ds string

	if d.Model != nil && d.corpus != nil {
		ds = fmt.Sprintf("\nDict Size: %d words\nMAXTAG: %d\nMAXDEPTYPE: %d\n", d.corpus.Size(), lingo.MAXTAG, lingo.MAXDEPTYPE)
	} else {
		ds = "\n"
	}

	if d.Model != nil && d.nn != nil && d.nn.initialized() {
		nns = fmt.Sprintf("\nNeural Network:\n=================\n%v\n", d.nn)
	}

	base := "\n\nDependency Parser Info:\n=======================\n"
	return base + ds + nns
}


================================================
FILE: dep/documentation/iamhuman.dot
================================================
digraph G {
	Node_0xc425b88740->Node_0xc425b88780[ label=Root ];
	Node_0xc425b88780->Node_0xc425b88800[ label=Cop ];
	Node_0xc425b88780->Node_0xc425b887c0[ label=NSubj ];
	Node_0xc425b88740 [ label="0: &#34;-ROOT-/ROOT_TAG&#34;" ];
	Node_0xc425b88780 [ label="3: &#34;human/JJ&#34;" ];
	Node_0xc425b887c0 [ label="1: &#34;I/PRP&#34;" ];
	Node_0xc425b88800 [ label="2: &#34;am/VBP&#34;" ];

}

================================================
FILE: dep/documentation/thecatsatonthemat.dot
================================================
digraph G {
	Node_0xc4349eeec0->Node_0xc4349eef80[ label=Root ];
	Node_0xc4349eef80->Node_0xc4349eefc0[ label=NMod ];
	Node_0xc4349eefc0->Node_0xc4349ef040[ label=Det ];
	Node_0xc4349eef80->Node_0xc4349eef00[ label=NSubj ];
	Node_0xc4349eef00->Node_0xc4349eef40[ label=Det ];
	Node_0xc4349eefc0->Node_0xc4349ef000[ label=Case ];
	Node_0xc4349eeec0 [ label="0: &#34;-ROOT-/ROOT_TAG&#34;" ];
	Node_0xc4349eef00 [ label="2: &#34;cat/NN&#34;" ];
	Node_0xc4349eef40 [ label="1: &#34;the/DT&#34;" ];
	Node_0xc4349eef80 [ label="3: &#34;sat/VBD&#34;" ];
	Node_0xc4349eefc0 [ label="6: &#34;mat/NN&#34;" ];
	Node_0xc4349ef000 [ label="4: &#34;on/IN&#34;" ];
	Node_0xc4349ef040 [ label="5: &#34;the/DT&#34;" ];

}


================================================
FILE: dep/errors.go
================================================
package dep

import (
	"fmt"

	"github.com/chewxy/lingo"
)

// componentUnavailable is an error naming a component that is not available.
type componentUnavailable string

// Error implements the error interface. The receiver is converted to a plain
// string before formatting: passing c itself to %v would make fmt call Error
// again and recurse without end.
func (c componentUnavailable) Error() string { return fmt.Sprintf("%v unavailable", string(c)) }

// Component returns the name of the unavailable component.
func (c componentUnavailable) Component() string { return string(c) }

// TarpitError is an error when the arc-standard is stuck.
// It embeds the offending configuration, so the configuration's GoString method
// (which prints the parser state) is promoted to the error.
// NOTE(review): the original comment also says it implements lingo.Sentencer via
// the embedded configuration - confirm against package lingo.
type TarpitError struct{ *configuration }

// Error implements the error interface with a fixed message.
func (err TarpitError) Error() string { return "Tarpit Error" }

// NonProjectiveError is the error that is emitted when the dependency tree is not projective (that is to say, the arcs cross).
// It embeds the offending dependency.
type NonProjectiveError struct{ *lingo.Dependency }

// Error implements the error interface with a fixed message.
func (err NonProjectiveError) Error() string { return "Non-projective tree" }


================================================
FILE: dep/evaluation.go
================================================
package dep

import (
	"fmt"
	"io/ioutil"

	"github.com/chewxy/lingo"
	"github.com/chewxy/lingo/treebank"
)

// Performance is a tuple that holds the evaluation scores of a training session.
type Performance struct {
	Iter int     // training iteration (epoch) the scores belong to
	UAS  float64 // Unlabelled Attachment Score
	LAS  float64 // Labeled Attachment Score
	UEM  float64 // Unlabelled Exact Match
	Root float64 // ratio of sentences whose root was predicted correctly
}

// String formats the scores on five lines (epoch, UAS, LAS, UEM, root ratio),
// with five decimals for each score and no trailing newline.
func (p Performance) String() string {
	const layout = "EPO: %d\n" +
		"UAS: %.5f\n" +
		"LAS: %.5f\n" +
		"UEM: %.5f\n" +
		"ROO: %.5f"

	return fmt.Sprintf(layout, p.Iter, p.UAS, p.LAS, p.UEM, p.Root)
}

// performance evaluation related code goes here

// Evaluate compares predicted trees with the gold standard trees and returns a
// Performance. It panics if the number of predicted trees and the number of
// gold trees aren't the same.
//
// Scores are computed over all words except the root:
//   - UAS: fraction of words whose predicted head matches the gold head;
//   - LAS: fraction of words whose predicted head AND dependency type both match;
//   - UEM: fraction of sentences in which every head matches;
//   - Root: fraction of sentences whose predicted root matches the gold root.
//
// A predicted sentence whose length differs from its gold counterpart counts as
// entirely wrong: its gold arcs are added to the denominator and it is skipped.
// The Iter field of the result is left at zero for the caller to fill in.
func Evaluate(predictedTrees, goldTrees []*lingo.Dependency) Performance {
	if len(predictedTrees) != len(goldTrees) {
		panic(fmt.Sprintf("%d predicted trees; %d gold trees. Unable to compare", len(predictedTrees), len(goldTrees)))
	}

	var correctLabels, correctHeads, correctTrees, correctRoot, sumArcs float64
	var check int

	for i, tr := range predictedTrees {
		gTr := goldTrees[i]

		if len(tr.AnnotatedSentence) != len(gTr.AnnotatedSentence) {
			sumArcs += float64(gTr.N())

			// log.Printf("WARNING: %q and %q do not have the same length", tr, gTr)
			continue
		}

		var nCorrectHead int
		for j, a := range tr.AnnotatedSentence[1:] {
			b := gTr.AnnotatedSentence[j+1]
			if a.HeadID() == b.HeadID() {
				correctHeads++
				nCorrectHead++

				// A labelled attachment is only correct when the head is
				// correct as well.
				if a.DependencyType == b.DependencyType {
					correctLabels++
				}
			}
			sumArcs++
		}
		if nCorrectHead == gTr.N() {
			correctTrees++
		}
		if tr.Root() == gTr.Root() {
			correctRoot++
		}

		// log the first five comparisons of each evaluation
		if check < 5 {
			logf("predictedHeads: \n%v\n%v\n", tr.Heads(), gTr.Heads())
			logf("Ns: %v | %v || Correct: %v", tr.N(), gTr.N(), nCorrectHead)
			check++
		}
	}

	uas := correctHeads / sumArcs
	las := correctLabels / sumArcs
	uem := correctTrees / float64(len(predictedTrees))
	roo := correctRoot / float64(len(predictedTrees))

	return Performance{UAS: uas, LAS: las, UEM: uem, Root: roo}
}

// crossValidate parses every sentence in st with the trainer's current model and
// scores the predictions against the gold dependencies of the same sentences.
func (t *Trainer) crossValidate(st []treebank.SentenceTag) Performance {
	predicted := t.predMany(st)

	gold := make([]*lingo.Dependency, len(st))
	for i, s := range st {
		gold[i] = s.Dependency(t)
	}

	return Evaluate(predicted, gold)
}

// predMany parses each sentence tag with the trainer's model and returns the
// predictions in input order. If a prediction fails, the network graph is dumped
// to "fullGraph.dot" in the working directory and the function panics with the
// error.
func (t *Trainer) predMany(sentenceTags []treebank.SentenceTag) []*lingo.Dependency {
	parsed := make([]*lingo.Dependency, len(sentenceTags))
	for i, st := range sentenceTags {
		dep, err := t.pred(st.AnnotatedSentence(t))
		if err == nil {
			parsed[i] = dep
			continue
		}

		ioutil.WriteFile("fullGraph.dot", []byte(t.nn.g.ToDot()), 0644)
		panic(fmt.Sprintf("%+v", err))
	}
	return parsed
}

// pred parses a single annotated sentence using a throwaway Parser that shares
// the trainer's model.
func (t *Trainer) pred(as lingo.AnnotatedSentence) (*lingo.Dependency, error) {
	p := &Parser{Model: t.Model}
	return p.predict(as)
}


================================================
FILE: dep/example.go
================================================
package dep

import (
	"math/rand"

	"github.com/chewxy/lingo"
	"github.com/chewxy/lingo/corpus"
	"github.com/chewxy/lingo/treebank"
)

// example is a training example: one parser state together with the transition
// the oracle chose in that state.
type example struct {
	transition // the gold (oracle) transition for this state

	features []int // features are used in the embeddings
	labels   []int // labels are used in scoring the transitions: 1 for the gold transition, 0 for other applicable ones, -1 for inapplicable ones
}

// makeExamples turns every sentence tag into training examples and returns them
// concatenated. Sentences that cannot be converted (non-projective trees,
// tarpits, or any other error) are skipped; the counts are logged at the end.
// The conf parameter is currently unused.
func makeExamples(sentenceTags []treebank.SentenceTag, conf NNConfig, dict *corpus.Corpus, ts []transition, f lingo.AnnotationFixer) []example {
	var (
		examples                    []example
		tarpit, nonprojective, good int
	)

	for i, sentenceTag := range sentenceTags {
		exs, err := makeOneExample(i, sentenceTag, dict, ts, f)

		switch err.(type) {
		case nil:
			examples = append(examples, exs...)
			good++
		case TarpitError:
			tarpit++
		case NonProjectiveError:
			nonprojective++
		}
	}

	logf("Number of SentenceTags Generated Into Examples: %d/%d | Number of Examples: %d | Number of nonprojective examples: %d | Number of tarpit examples: %d", good, len(sentenceTags), len(examples), nonprojective, tarpit)
	return examples
}

// makeOneExample converts one SentenceTag into its sequence of training
// examples (despite the name, it yields one example per oracle transition).
//
// The sentence's gold tree must be projective, otherwise a NonProjectiveError is
// returned. The oracle is then replayed from the initial configuration; at each
// step the features are extracted and every known transition is labelled 1 (the
// oracle's choice), 0 (applicable but not chosen) or -1 (not applicable). If the
// configuration has not become terminal after 999 steps, the examples gathered
// so far are returned together with a TarpitError. The i parameter is unused.
func makeOneExample(i int, sentenceTag treebank.SentenceTag, dict *corpus.Corpus, ts []transition, f lingo.AnnotationFixer) ([]example, error) {
	s := sentenceTag.AnnotatedSentence(f)
	dep := s.Dependency()
	if !dep.IsProjective() {
		return nil, NonProjectiveError{dep}
	}

	var examples []example
	c := newConfiguration(s, true)

	for count := 0; !c.isTerminal() && count < 1000; count++ {
		if count == 999 {
			return examples, TarpitError{c}
		}

		gold := c.oracle(dep)
		features := getFeatures(c, dict)

		labels := make([]int, MAXTRANSITION)
		for idx, t := range ts {
			switch {
			case t == gold:
				labels[idx] = 1
			case c.canApply(t):
				labels[idx] = 0
			default:
				labels[idx] = -1
			}
		}

		examples = append(examples, example{transition{gold.Move, gold.DependencyType}, features, labels})

		c.apply(gold)
	}

	return examples, nil
}

// shuffleExamples shuffles a in place using the package-level math/rand source:
// each position i is swapped with a uniformly chosen position in [0, i].
func shuffleExamples(a []example) {
	for i := 0; i < len(a); i++ {
		if j := rand.Intn(i + 1); j != i {
			a[i], a[j] = a[j], a[i]
		}
	}
}


================================================
FILE: dep/example_test.go
================================================
package dep

import (
	"testing"

	"github.com/chewxy/lingo/corpus"
)

// TestMakeExamples checks that the simple sentence yields exactly 20 training
// examples, reporting the actual count on failure.
func TestMakeExamples(t *testing.T) {
	st := simpleSentence()
	dict := corpus.GenerateCorpus(st)

	exs := makeExamples(st, DefaultNNConfig, dict, transitions, dummyFix{})
	if len(exs) != 20 {
		t.Errorf("Expected 20 examples to be generated from simple sentence. Got %d", len(exs))
	}
}


================================================
FILE: dep/featureExtraction.go
================================================
package dep

import (
	"github.com/chewxy/lingo"
	"github.com/chewxy/lingo/corpus"
)

// getFeatures extracts the IDs to pass into the neural network. These IDs are
// used in the network to construct the input layers.
//
// For a configuration it collects, in this order:
//   - the three topmost stack items (deepest first) and the next three buffer
//     entries: word and POS tag each;
//   - for each of the two topmost stack items: its 1st left child, 1st right
//     child, 2nd left child, 2nd right child, the 1st left child of its 1st left
//     child, and the 1st right child of its 1st right child: word, POS tag and
//     dependency label each.
//
// That gives 18 word features, 18 POS features and 12 label features. Words
// missing from dict map to the ID of "-UNKNOWN-". The returned slice has
// MAXFEATURE entries: words first (offset by wordFeatsStartAt), POS tags from
// POS_OFFSET, labels from DEP_OFFSET (offset by labelFeatsStartAt).
func getFeatures(c *configuration, dict *corpus.Corpus) []int {
	var (
		wordFeats  []int
		posFeats   []lingo.POSTag
		labelFeats []lingo.DependencyType
	)
	unknownID, _ := dict.Id("-UNKNOWN-")

	// addToken records the word ID (or the unknown-word ID) and the POS tag of
	// the word at index.
	addToken := func(index head) {
		mor := c.annotation(index)
		wordID, ok := dict.Id(mor.Value)
		if !ok {
			wordID = unknownID
		}
		wordFeats = append(wordFeats, wordID)
		posFeats = append(posFeats, mor.POSTag)
	}

	// addChild records word and POS tag like addToken, plus the dependency label
	// currently assigned to the word at index.
	addChild := func(index head) {
		addToken(index)
		labelFeats = append(labelFeats, c.label(index))
	}

	// stack: third from the top first, top last
	for j := 2; j >= 0; j-- {
		addToken(c.stackValue(j))
	}

	// buffer: next word first
	for j := 0; j <= 2; j++ {
		addToken(c.bufferValue(j))
	}

	// children and grandchildren of the two topmost stack items
	for j := 0; j <= 1; j++ {
		k := c.stackValue(j)

		addChild(c.lc(k, 1))
		addChild(c.rc(k, 1))
		addChild(c.lc(k, 2))
		addChild(c.rc(k, 2))
		addChild(c.lc(c.lc(k, 1), 1))
		addChild(c.rc(c.rc(k, 1), 1))
	}

	// the embedding matrix is arranged thus:
	/*
		POSTag0 0, 1, ... 50
		POSTag1
		...
		MAXTAG-1
		DepType0
		DepType1
		...
		MAXDEPTYPE-1
		WordID0
		...
		WordIDN
	*/

	features := make([]int, MAXFEATURE)

	for i, w := range wordFeats {
		features[i] = w + wordFeatsStartAt
	}
	for i, t := range posFeats {
		features[POS_OFFSET+i] = int(t)
	}
	for i, l := range labelFeats {
		features[DEP_OFFSET+i] = int(l) + labelFeatsStartAt
	}

	return features
}

// Positions within the feature vector produced by getFeatures.
const (
	POS_OFFSET   int = 18 // index of the first POS-tag feature (after the 18 word features)
	DEP_OFFSET       = 36 // index of the first dependency-label feature (after the 18 POS features)
	STACK_OFFSET     = 6  // NOTE(review): not referenced by getFeatures - presumably the index of the first stack-child feature; confirm
	STACK_NUMBER     = 6  // NOTE(review): not referenced by getFeatures - presumably the number of child features per stack item; confirm
)


================================================
FILE: dep/features.go
================================================
package dep

import "github.com/chewxy/lingo"

// the features are used as columns in the matrix

//go:generate stringer -type=feature -output=features_string.go

// feature names a position in the feature vector. The naming scheme is:
// s0/s1/s2 and b0/b1/b2 for stack and buffer items; l1/r1/l2/r2 for the first
// and second left/right child; ll/rr for the leftmost-of-leftmost and
// rightmost-of-rightmost grandchild; and a trailing w, t or d for word, POS tag
// or dependency label.
type feature int

const (
	// first 18 are word related features
	// second 18 are POS related features
	// last 12 are label related features

	s0w feature = iota
	s1w
	s2w

	b0w
	b1w
	b2w

	s0l1w
	s0r1w
	s0l2w
	s0r2w
	s0llw
	s0rrw

	s1l1w
	s1r1w
	s1l2w
	s1r2w
	s1llw
	s1rrw

	// POS related features
	s0t
	s1t
	s2t

	b0t
	b1t
	b2t

	s0l1t
	s0r1t
	s0l2t
	s0r2t
	s0llt
	s0rrt

	s1l1t
	s1r1t
	s1l2t
	s1r2t
	s1llt
	s1rrt

	// label related
	s0l1d
	s0r1d
	s0l2d
	s0r2d
	s0lld
	s0rrd

	s1l1d
	s1r1d
	s1l2d
	s1r2d
	s1lld
	s1rrd

	// MAXFEATURE is the total number of features (the length of a feature vector).
	MAXFEATURE
)

// Offsets added to raw IDs so that POS tags, dependency types and words occupy
// consecutive, non-overlapping ranges: POS tags first, then dependency types,
// then words.
const (
	wordFeatsStartAt  int = int(lingo.MAXTAG) + int(lingo.MAXDEPTYPE) // first ID used for words
	labelFeatsStartAt     = int(lingo.MAXTAG)                         // first ID used for dependency types
	posFeatsStartAt       = 0                                         // first ID used for POS tags
)


================================================
FILE: dep/features_string.go
================================================
// generated by stringer -type=feature -output=features_string.go; DO NOT EDIT

package dep

import "fmt"

const _feature_name = "s0ws1ws2wb0wb1wb2ws0l1ws0r1ws0l2ws0r2ws0llws0rrws1l1ws1r1ws1l2ws1r2ws1llws1rrws0ts1ts2tb0tb1tb2ts0l1ts0r1ts0l2ts0r2ts0llts0rrts1l1ts1r1ts1l2ts1r2ts1llts1rrts0l1ds0r1ds0l2ds0r2ds0llds0rrds1l1ds1r1ds1l2ds1r2ds1llds1rrdMAXFEATURE"

var _feature_index = [...]uint8{0, 3, 6, 9, 12, 15, 18, 23, 28, 33, 38, 43, 48, 53, 58, 63, 68, 73, 78, 81, 84, 87, 90, 93, 96, 101, 106, 111, 116, 121, 126, 131, 136, 141, 146, 151, 156, 161, 166, 171, 176, 181, 186, 191, 196, 201, 206, 211, 216, 226}

func (i feature) String() string {
	if i < 0 || i >= feature(len(_feature_index)-1) {
		return fmt.Sprintf("feature(%d)", i)
	}
	return _feature_name[_feature_index[i]:_feature_index[i+1]]
}


================================================
FILE: dep/fix.go
================================================
package dep

import (
	"log"

	"github.com/chewxy/lingo"
)

// fix applies common post-parse fixes to d in place.
//
// NNP fix: if a sentence is [a, b, c, D, E, f, g] where D, E are NNPs, they
// should form a compound. For every run of two or more consecutive proper nouns
// in which no word is attached with a compound relation, the words that look
// wrongly attached are re-attached as Compound dependents of the compound root,
// and their former children inherit their former head and relation.
//
// Spans that already contain compound relations are not handled yet; that case
// is only logged.
func fix(d *lingo.Dependency) {
	spans := properNounSpans(d)
	for _, s := range spans {
		// we don't care about single word proper nouns
		if s.end-s.start <= 1 {
			continue
		}

		phrase := d.AnnotatedSentence[s.start:s.end]

		// pick up all compound roots
		// find annotations that do not have compound as deptype
		var compoundRoots lingo.AnnotationSet
		var problematic lingo.AnnotationSet
		for _, a := range phrase {
			if lingo.IsCompound(a.DependencyType) {
				compoundRoots = compoundRoots.Add(a.Head)
			}

			if !lingo.IsCompound(a.DependencyType) && a.ID != s.end-1 {
				problematic = problematic.Add(a)
			}
		}

		// if there is at least one root...
		if len(compoundRoots) > 0 {
			logf("More than zero compound roots not handled yet")
			continue
		}

		// if no root: walk backwards from the last word of the span to find the
		// word that should act as the compound root
		var compoundRoot *lingo.Annotation
		var rootRoot *lingo.Annotation
		for last := -1; s.end+last >= s.start; last-- {
			predictedRoot := s.end + last
			compoundRoot = d.AnnotatedSentence[predictedRoot]

			// incorrects :
			//	dep==Dep
			// 	dep==Root && others has dep != root

			if compoundRoot.DependencyType == lingo.Dep {
				problematic = problematic.Add(compoundRoot)
				continue
			}

			if compoundRoot.DependencyType != lingo.Dep && compoundRoot.DependencyType != lingo.Root {
				break
			}

			if compoundRoot.DependencyType == lingo.Root {
				rootRoot = compoundRoot
				problematic = problematic.Add(compoundRoot)
			}
		}

		if rootRoot != nil && rootRoot != compoundRoot {
			// we have two potential roots. Choose the best
			log.Println("Problem when fixing: more than one possible compound root found")
		}

		for _, a := range problematic {
			if a == compoundRoot {
				continue
			}
			tmpHead := a.Head
			tmpRel := a.DependencyType

			a.SetHead(compoundRoot)
			a.DependencyType = lingo.Compound

			// the re-attached word's children take over its old head and relation
			for _, childID := range d.AnnotatedSentence.Children(a.ID) {
				childA := d.AnnotatedSentence[childID]
				childA.SetHead(tmpHead)
				childA.DependencyType = tmpRel
			}
		}
	}

	// Number fix
}

// properNounSpans scans the annotated sentence of d and returns one span for
// every maximal run of consecutive annotations whose POSTag satisfies
// lingo.IsProperNoun. Each span is half-open: start is the index of the first
// proper noun in the run and end is one past the last. A run that reaches the
// end of the sentence is closed at len(d.AnnotatedSentence).
func properNounSpans(d *lingo.Dependency) (retVal []span) {
	const unset = -1

	first, limit := unset, unset
	for idx, ann := range d.AnnotatedSentence {
		if !lingo.IsProperNoun(ann.POSTag) {
			// a non-proper-noun terminates whatever run was open
			if first != unset {
				retVal = append(retVal, makeSpan(first, limit))
			}
			first, limit = unset, unset
			continue
		}

		if first == unset {
			first = idx
		}
		limit = idx + 1
	}

	// the sentence ended while a run was still open
	if first != unset {
		retVal = append(retVal, makeSpan(first, len(d.AnnotatedSentence)))
	}
	return retVal
}


================================================
FILE: dep/init.go
================================================
package dep

import "github.com/chewxy/lingo/corpus"

// init seeds the package-level KnownWords with a freshly created corpus that
// already contains the empty string.
func init() {
	seed := corpus.New()
	seed.Add("") // add null words

	KnownWords = seed
}


================================================
FILE: dep/models.go
================================================
package dep

import (
	"bufio"
	"bytes"
	"encoding/gob"
	"fmt"
	"io"
	"os"

	"github.com/chewxy/lingo/corpus"
	"github.com/pkg/errors"
	"gorgonia.org/tensor"
)

// Model holds the neural network that a DependencyParser uses. To train, use a Trainer
type Model struct {
	nn     *neuralnetwork2 // the network; Save refuses to run when this is nil
	corpus *corpus.Corpus  // serialized ahead of nn and attached to it as its dict on load
	ts     []transition    // transitions handed to nn on load; falls back to the package-level transitions
}

// Corpus returns the corpus held by the model.
func (m *Model) Corpus() *corpus.Corpus {
	return m.corpus
}

// WordEmbeddings returns a clone of the value currently bound to the
// network's word embedding node (e_w).
func (m *Model) WordEmbeddings() *tensor.Dense {
	return m.nn.e_w.Value().(*tensor.Dense).Clone().(*tensor.Dense)
}

// POSTagEmbeddings returns a clone of the value currently bound to the
// network's POSTag embedding node (e_t).
func (m *Model) POSTagEmbeddings() *tensor.Dense {
	return m.nn.e_t.Value().(*tensor.Dense).Clone().(*tensor.Dense)
}

// String renders the neural network description followed by the list of
// transitions, each formatted with %v and followed by ", ".
func (m *Model) String() string {
	var out bytes.Buffer

	out.WriteString(m.nn.String())
	out.WriteString("Transitions: [")
	for i := range m.ts {
		fmt.Fprintf(&out, "%v, ", m.ts[i])
	}
	out.WriteString("]")

	return out.String()
}

// Save writes the model to a newly created file called filename. It returns
// an error without touching the filesystem when the model has no neural
// network. The created file is handed to SaveWriter, which closes it.
func (m *Model) Save(filename string) error {
	if m.nn == nil {
		return errors.Errorf("Cannot save a model with no nn")
	}

	out, createErr := os.Create(filename)
	if createErr != nil {
		return createErr
	}

	return m.SaveWriter(out)
}

// SaveWriter gob-encodes the model's corpus followed by its neural network
// into f through a buffered writer, and always closes f before returning.
//
// The first error encountered is returned. This includes errors from flushing
// the buffer and from closing f, which would otherwise leave a truncated file
// behind while reporting success.
//
// The transitions are not written; LoadReader falls back to the package-level
// transitions when it cannot decode them.
func (m *Model) SaveWriter(f io.WriteCloser) (err error) {
	defer func() {
		// Close even on failure, but never mask an earlier error.
		if cerr := f.Close(); cerr != nil && err == nil {
			err = cerr
		}
	}()

	w := bufio.NewWriter(f)
	encoder := gob.NewEncoder(w)

	if err = encoder.Encode(m.corpus); err != nil {
		return err
	}

	if err = encoder.Encode(m.nn); err != nil {
		return err
	}

	// Flush explicitly so a short write is reported to the caller.
	return w.Flush()
}

// Load opens the file called filename and decodes a Model from it via
// LoadReader, which closes the file.
func Load(filename string) (*Model, error) {
	in, openErr := os.Open(filename)
	if openErr != nil {
		return nil, openErr
	}

	return LoadReader(in)
}

// LoadReader decodes a Model from rd and closes rd before returning.
//
// The stream is read in the order SaveWriter wrote it: corpus first, then the
// neural network. The network must have its dict set before it is decoded, so
// the freshly decoded corpus is attached to it up front. If the transitions
// cannot be decoded from the stream, the package-level transitions are used.
func LoadReader(rd io.ReadCloser) (*Model, error) {
	defer rd.Close()
	dec := gob.NewDecoder(bufio.NewReader(rd))

	model := new(Model)
	if err := dec.Decode(&model.corpus); err != nil {
		return nil, err
	}

	model.nn = &neuralnetwork2{dict: model.corpus}
	if err := dec.Decode(&model.nn); err != nil {
		return nil, err
	}

	if err := dec.Decode(&model.ts); err != nil {
		model.ts = transitions
	}
	model.nn.transitions = model.ts

	return model, nil
}


================================================
FILE: dep/models_test.go
================================================
package dep

import (
	"os"
	"testing"

	"github.com/stretchr/testify/assert"
	G "gorgonia.org/gorgonia"
)

// TestModel_SaveLoad checks that a Model without a network refuses to save,
// and that a saved, initialized Model loads back with the same corpus and the
// same w2 and e_w values.
func TestModel_SaveLoad(t *testing.T) {
	assert := assert.New(t)

	testFileName := "TestSave.dat"
	m := new(Model)

	// a model without a neural network must not be saveable
	if err := m.Save(testFileName); err == nil {
		t.Error("Expected an error")
	}

	conf := DefaultNNConfig
	conf.Dtype = G.Float32
	m = new(Model)
	m.ts = transitions
	m.corpus = KnownWords

	m.nn = new(neuralnetwork2)
	m.nn.NNConfig = conf
	m.nn.dict = m.corpus

	// nothing below is meaningful without an initialized network
	if err := m.nn.init(); err != nil {
		t.Fatal(err)
	}

	if err := m.Save(testFileName); err != nil {
		t.Fatal(err)
	}

	// cleanup, also when a later step aborts the test
	defer func() {
		if err := os.Remove(testFileName); err != nil {
			t.Error(err)
		}
	}()

	// m2 is dereferenced below, so a failed load has to stop the test
	m2, err := Load(testFileName)
	if err != nil {
		t.Fatal(err)
	}

	assert.Equal(m.corpus, m2.corpus, "Both Dependency Parsers need to have the same dict")

	if !G.ValueEq(m.nn.w2.Value(), m2.nn.w2.Value()) {
		t.Errorf("Expected w2 to be equal")
	}
	if !G.ValueEq(m.nn.e_w.Value(), m2.nn.e_w.Value()) {
		t.Errorf("Expected e_w to be equal")
	}
}


================================================
FILE: dep/move.go
================================================
package dep

// Move is an action that the dependency parser can take: Shift, attach left
// (Left), or attach right (Right).
type Move byte

//go:generate stringer -type=Move

// The moves are numbered from zero by iota. MAXMOVE is not a move itself; it
// comes last, so its value equals the number of moves declared before it.
const (
	Shift Move = iota
	Left
	Right

	MAXMOVE
)

// ALLMOVES is the set of all possible moves. Note that the order here
// (Left, Right, Shift) differs from the numeric order of the constants.
var ALLMOVES = [...]Move{Left, Right, Shift}


================================================
FILE: dep/move_string.go
================================================
// generated by stringer -type=Move; DO NOT EDIT

package dep

import "fmt"

const _Move_name = "ShiftLeftRightMAXMOVE"

var _Move_index = [...]uint8{0, 5, 9, 14, 21}

// String returns the name of the Move as spelled in the const block, or
// "Move(N)" when i is outside the range covered by _Move_index.
func (i Move) String() string {
	if i >= Move(len(_Move_index)-1) {
		return fmt.Sprintf("Move(%d)", i)
	}
	return _Move_name[_Move_index[i]:_Move_index[i+1]]
}


================================================
FILE: dep/nn2.go
================================================
package dep

import (
	"github.com/chewxy/lingo"
	"github.com/chewxy/lingo/corpus"
	"github.com/pkg/errors"
	G "gorgonia.org/gorgonia"
	"gorgonia.org/tensor"
)

// may is a tiny error-carrying wrapper around a graph node. Once an operation
// has failed, every later operation on the same may becomes a no-op, so a
// chain of operations needs only one error check at the end.
type may struct {
	error
	n *G.Node
}

// doUnary replaces the wrapped node with fn(node), unless an earlier
// operation already failed.
func (m *may) doUnary(fn func(*G.Node) (*G.Node, error)) {
	if m.error == nil {
		m.n, m.error = fn(m.n)
	}
}

// doBinary replaces the wrapped node with fn(node, other), unless an earlier
// operation already failed.
func (m *may) doBinary(fn func(a, b *G.Node) (*G.Node, error), other *G.Node) {
	if m.error == nil {
		m.n, m.error = fn(m.n, other)
	}
}

// doSwapBinary is doBinary with the operands reversed: the wrapped node is
// replaced with fn(other, node).
func (m *may) doSwapBinary(fn func(a, b *G.Node) (*G.Node, error), other *G.Node) {
	if m.error == nil {
		m.n, m.error = fn(other, m.n)
	}
}

// neuralnetwork2 bundles the expression graph, the trainable parameters, the
// per-feature select nodes and the machinery (VM, solver) that init builds.
// The shapes noted below are the ones init passes when creating the nodes.
type neuralnetwork2 struct {
	NNConfig

	g   *G.ExprGraph // full graph, including the cost and its gradients
	sub *G.ExprGraph // subgraph rooted at scores; pred runs this one

	// model

	// embedding matrices for word, POSTags and labels respectively
	e_w *G.Node // Shape: (dict size, but at least the number of word features; EmbeddingSize)
	e_t *G.Node // Shape: (lingo.MAXTAG, EmbeddingSize)
	e_l *G.Node // Shape: (lingo.MAXDEPTYPE, EmbeddingSize)

	// w1
	w1_w *G.Node // Shape: (HiddenSize, EmbeddingSize * number of word features)
	w1_t *G.Node // Shape: (HiddenSize, EmbeddingSize * number of POSTag features)
	w1_l *G.Node // Shape: (HiddenSize, EmbeddingSize * number of dependency features)
	b    *G.Node // Shape: (HiddenSize)

	// w2
	w2 *G.Node // Shape: (MAXTRANSITION, HiddenSize)

	// selects: one slice node per feature, re-bound by feats2vec for every example
	x_wSelW G.Nodes // 18 - word features
	x_tSelT G.Nodes // 18 - POSTag features
	x_lSelL G.Nodes // 12 - Dependency feature

	// inputs (feature vectors built up from the selects)
	x_w *G.Node
	x_t *G.Node
	x_l *G.Node

	// outputs
	scores  *G.Node // argmax this to get the greedy decoded transition
	logProb *G.Node // result of applying G.SoftMax to scores
	cost    *G.Node // a single slice of logProb; train re-binds which one
	costVal G.Value // filled from cost via G.Read

	vm     G.VM
	model  G.Nodes // the trainable parameters handed to the solver
	solver G.Solver

	dict        *corpus.Corpus
	transitions []transition

	costChan chan G.Value // when non-nil, train sends costVal here after each batch

	// wordfeats *G.Node
	// tagfeats  *G.Node
	// depfeats  *G.Node
	// sumfeats  *G.Node
	// act       *G.Node
}

// initialized reports whether every piece that init is responsible for is in
// place: both graphs, all parameter nodes, non-empty select lists, the input
// vectors, the scores node, the dictionary, the VM and the solver.
func (nn *neuralnetwork2) initialized() bool {
	switch {
	case nn.g == nil, nn.sub == nil:
		return false
	case nn.e_w == nil, nn.e_t == nil, nn.e_l == nil:
		return false
	case nn.w1_w == nil, nn.w1_t == nil, nn.w1_l == nil, nn.b == nil:
		return false
	case nn.w2 == nil:
		return false
	case len(nn.x_wSelW) == 0, len(nn.x_tSelT) == 0, len(nn.x_lSelL) == 0:
		return false
	case nn.x_w == nil, nn.x_t == nil, nn.x_l == nil:
		return false
	case nn.scores == nil:
		return false
	case nn.dict == nil, nn.vm == nil, nn.solver == nil:
		return false
	}
	return true
}

// init builds the whole network from nn.NNConfig and nn.dict: it creates a
// new graph, the parameter nodes, the per-feature select nodes, the forward
// pass (via fwd), the gradients of the cost with respect to the parameters,
// the scores-only subgraph, the tape machine and the AdaGrad solver.
//
// It returns an error when no dictionary is attached, or when any graph
// construction step fails.
func (nn *neuralnetwork2) init() error {
	if nn.dict == nil {
		return errors.Errorf("No Corpus Provided to the Neural Network. Will be unable to decode")
	}

	g := G.NewGraph()
	nn.g = g

	// number of rows of each embedding matrix
	word := nn.dict.Size()
	tags := int(lingo.MAXTAG)
	deps := int(lingo.MAXDEPTYPE)
	// trns := len(nn.transitions)

	// number of features of each kind, derived from the feature offsets
	wordFeats := POS_OFFSET - 0
	tagFeats := DEP_OFFSET - POS_OFFSET
	depFeats := int(MAXFEATURE) - DEP_OFFSET

	// In case a very very very small dict was passed in
	// we set the minimum to wordFeats, because the dummy slices below
	// take rows 0..wordFeats-1 of e_w
	if word < wordFeats {
		word = wordFeats
	}

	logf(`Word: %d
tags: %d
deps: %d
wordFeats: %d
tagFeats: %d
depFeats: %d
`, word, tags, deps, wordFeats, tagFeats, depFeats)

	// define inputs
	// (fwd later reassigns x_w, x_t and x_l to the concatenation of the selects)
	nn.x_w = G.NewVector(g, nn.Dtype, G.WithShape(wordFeats*nn.EmbeddingSize), G.WithName("word input"), G.WithInit(G.Zeroes()))
	nn.x_t = G.NewVector(g, nn.Dtype, G.WithShape(tagFeats*nn.EmbeddingSize), G.WithName("POSTag input"), G.WithInit(G.Zeroes()))
	nn.x_l = G.NewVector(g, nn.Dtype, G.WithShape(depFeats*nn.EmbeddingSize), G.WithName("word input"), G.WithInit(G.Zeroes()))

	nn.x_wSelW = make(G.Nodes, wordFeats)
	nn.x_tSelT = make(G.Nodes, tagFeats)
	nn.x_lSelL = make(G.Nodes, depFeats)

	// define models
	nn.e_w = G.NewMatrix(g, nn.Dtype, G.WithShape(word, nn.EmbeddingSize), G.WithName("e_w"), G.WithInit(G.GlorotU(1)))
	nn.e_t = G.NewMatrix(g, nn.Dtype, G.WithShape(tags, nn.EmbeddingSize), G.WithName("e_t"), G.WithInit(G.GlorotU(1)))
	nn.e_l = G.NewMatrix(g, nn.Dtype, G.WithShape(deps, nn.EmbeddingSize), G.WithName("e_l"), G.WithInit(G.GlorotU(1)))

	nn.w1_w = G.NewMatrix(g, nn.Dtype, G.WithShape(nn.HiddenSize, nn.EmbeddingSize*wordFeats), G.WithName("w1_w"), G.WithInit(G.GlorotU(1)))
	nn.w1_t = G.NewMatrix(g, nn.Dtype, G.WithShape(nn.HiddenSize, nn.EmbeddingSize*tagFeats), G.WithName("w1_t"), G.WithInit(G.GlorotU(1)))
	nn.w1_l = G.NewMatrix(g, nn.Dtype, G.WithShape(nn.HiddenSize, nn.EmbeddingSize*depFeats), G.WithName("w1_l"), G.WithInit(G.GlorotU(1)))
	nn.b = G.NewVector(g, nn.Dtype, G.WithShape(nn.HiddenSize), G.WithName("b"), G.WithInit(G.Zeroes()))

	nn.w2 = G.NewMatrix(g, nn.Dtype, G.WithShape(MAXTRANSITION, nn.HiddenSize), G.WithName("w2"), G.WithInit(G.GlorotU(1)))

	// the parameters that are differentiated against and stepped by the solver
	nn.model = G.Nodes{nn.e_w, nn.e_t, nn.e_l, nn.w1_w, nn.w1_t, nn.w1_l, nn.b, nn.w2}

	// define selects
	// words first
	logf("nn.e_w: %+1.1s", nn.e_w.Value())
	var err error
	for i := 0; i < wordFeats; i++ {
		if nn.x_wSelW[i], err = G.Slice(nn.e_w, G.S(i)); err != nil { // dummy slices... they'll be replaced at runtime
			return err
		}

	}

	// tag features
	for i := 0; i < tagFeats; i++ {
		if nn.x_tSelT[i], err = G.Slice(nn.e_t, G.S(i)); err != nil { // dummy slices... they'll be replaced at runtime
			return err
		}
	}

	// dependency features
	for i := 0; i < depFeats; i++ {
		if nn.x_lSelL[i], err = G.Slice(nn.e_l, G.S(i)); err != nil { // dummy slices, re-bound by feats2vec
			return err
		}
	}

	// forwards
	if err = nn.fwd(); err != nil {
		return err
	}

	// backprop
	if _, err = G.Grad(nn.cost, nn.model...); err != nil {
		return err
	}

	// pred only needs the part of the graph that produces the scores
	nn.sub = g.SubgraphRoots(nn.scores)

	// prog, locmap, err := G.Compile(nn.g)
	// if err != nil {
	// 	return err
	// }
	// log.Printf("Prog: %v", prog)

	// ioutil.WriteFile("graph.dot", []byte(g.ToDot()), 0644)

	// logger := log.New(os.Stderr, "", 0)
	// nn.vm = G.NewTapeMachine(prog, locmap, G.BindDualValues(nn.model...), G.UseCudaFor(), G.WithLogger(logger), G.WithWatchlist())
	// nn.vm = G.NewTapeMachine(prog, locmap, G.BindDualValues(nn.model...), G.UseCudaFor())
	nn.vm = G.NewTapeMachine(nn.g, G.BindDualValues(nn.model...), G.UseCudaFor())
	G.BindDualValues(nn.scores)(nn.vm) // makes sure that scores is a *dualValue
	nn.solver = G.NewAdaGradSolver(G.WithLearnRate(nn.AdaAlpha), G.WithEps(nn.AdaEps), G.WithL2Reg(nn.Reg), G.WithBatchSize(float64(nn.BatchSize)))
	// nn.solver = G.NewVanillaSolver(G.WithLearnRate(nn.AdaAlpha), G.WithL2Reg(nn.Reg))
	return nil
}

// fwd wires up the forward pass on the graph:
//
//	x_w, x_t, x_l = concatenation of the word / POSTag / dependency selects
//	hidden        = Cube(w1_w·x_w + w1_t·x_t + w1_l·x_l + b), with dropout when nn.Dropout > 0
//	scores        = w2·hidden
//	logProb       = SoftMax(scores)
//	cost          = a single slice of logProb (re-bound to the gold transition by train)
//
// It also registers a read of cost into nn.costVal. The first error raised
// while building the graph is returned as-is.
func (nn *neuralnetwork2) fwd() error {
	var err error

	// build up x vectors
	if nn.x_w, err = G.Concat(0, nn.x_wSelW...); err != nil {
		return err
	}

	if nn.x_t, err = G.Concat(0, nn.x_tSelT...); err != nil {
		return err
	}

	if nn.x_l, err = G.Concat(0, nn.x_lSelL...); err != nil {
		return err
	}

	logf("w1_w %v, x_w %v", nn.w1_w.Shape(), nn.x_w.Shape())
	m_w := &may{nil, nn.w1_w}
	m_w.doBinary(G.Mul, nn.x_w)
	if m_w.error != nil {
		return m_w.error
	}

	logf("w1_t %v, x_t %v", nn.w1_t.Shape(), nn.x_t.Shape())
	m_t := &may{nil, nn.w1_t}
	m_t.doBinary(G.Mul, nn.x_t)
	if m_t.error != nil {
		return m_t.error
	}

	logf("w1_l %v, x_l %v", nn.w1_l.Shape(), nn.x_l.Shape())
	m_l := &may{nil, nn.w1_l}
	m_l.doBinary(G.Mul, nn.x_l)
	if m_l.error != nil {
		return m_l.error
	}

	// add and activate layer 1
	logf("w : %v", m_w.n.Shape())
	m_w1 := &may{nil, m_w.n}
	m_w1.doBinary(G.Add, m_t.n)
	m_w1.doBinary(G.Add, m_l.n)
	m_w1.doBinary(G.Add, nn.b)
	m_w1.doUnary(G.Cube)
	if m_w1.error != nil {
		return m_w1.error
	}

	if nn.Dropout > 0 {
		logf("Doing dropout")
		m_w1.n, m_w1.error = G.Dropout(m_w1.n, nn.Dropout)
		if m_w1.error != nil {
			return m_w1.error
		}
	}

	// go to softmax layer
	logf("w2: %v, w1act: %v", nn.w2.Shape(), m_w1.n.Shape())
	m_sm := &may{nil, nn.w2}
	m_sm.doBinary(G.Mul, m_w1.n)
	nn.scores = m_sm.n // the scores are taken before the softmax is applied
	m_sm.doUnary(G.SoftMax)
	if m_sm.error != nil {
		// return the underlying error, not the *may wrapper, like every check above
		return m_sm.error
	}

	nn.logProb = m_sm.n
	// G.WithName("Logprob")(nn.logProb)
	// log.Printf("LOGPROB %v %p %v", nn.logProb, nn.logProb, nn.logProb)
	if nn.cost, err = G.Slice(nn.logProb, G.S(0)); err != nil { // slice is a dummy tensor.Slice. It'll be replaced at runtime
		return err
	}

	G.Read(nn.cost, &nn.costVal)
	return nil
}

// costProgress returns the receive side of the channel on which train
// publishes the cost after each batch. The channel is created unbuffered on
// first use.
func (nn *neuralnetwork2) costProgress() <-chan G.Value {
	ch := nn.costChan
	if ch == nil {
		ch = make(chan G.Value)
		nn.costChan = ch
	}
	return ch
}

// train does one epoch of training. The examples are batched.
//
// For every example the select nodes are re-bound to the example's features,
// the cost node is re-bound to the gold transition, and the VM is run. After
// each batch the solver takes one step over the model, and the latest cost is
// sent on costChan when a listener was set up through costProgress.
//
// When BatchSize exceeds the number of examples, everything is treated as a
// single batch and the solver's batch size is adjusted to match.
//
// The first error from binding features, binding the cost, running the VM or
// stepping the solver aborts the epoch and is returned.
//
// NOTE(review): the batch count is size / BatchSize with integer division, so
// when size is not a multiple of BatchSize the trailing partial batch looks
// like it is never visited, even though the end-clamping below seems written
// for it — confirm whether that is intended.
func (nn *neuralnetwork2) train(examples []example) error {
	size := len(examples)
	batches := size / nn.BatchSize

	var start, end int
	if nn.BatchSize > size {
		batches = 1
		end = size
		G.WithBatchSize(float64(size))(nn.solver) // set it such that the solver doesn't get confused
	} else {
		end = nn.BatchSize
	}

	for batch := 0; batch < batches; batch++ {
		for _, ex := range examples[start:end] {
			// a failed bind would leave the selects pointing at a previous example
			if err := nn.feats2vec(ex.features); err != nil {
				return err
			}
			tid := lookupTransition(ex.transition, nn.transitions)

			if err := G.UnsafeLet(nn.cost, G.S(tid)); err != nil {
				return err
			}

			if err := nn.vm.RunAll(); err != nil {
				return err
			}

			nn.vm.Reset()
		}
		if err := nn.solver.Step(G.NodesToValueGrads(nn.model)); err != nil {
			err = errors.Wrapf(err, "Stepping on the model failed %v", batch)
			return err
		}

		if nn.costChan != nil {
			nn.costChan <- nn.costVal
		}

		start = end
		if start >= size {
			break
		}
		end += nn.BatchSize
		if end >= size {
			end = size
		}
	}

	return nil
}

// pred predicts the index of the transitions.
//
// ind holds the feature indicators of one parser configuration. They are
// bound to the select nodes, the scores subgraph is run forward-only on a
// fresh lisp machine, and the argmax over the scores is returned.
//
// An error is returned when the features cannot be bound, when the machine
// fails to run, or when the argmax fails.
func (nn *neuralnetwork2) pred(ind []int) (int, error) {
	// scoring with features that failed to bind would silently reuse stale selects
	if err := nn.feats2vec(ind); err != nil {
		return 0, err
	}

	// f, _ := os.OpenFile("LOOOOOG", os.O_APPEND|os.O_CREATE|os.O_RDWR, 0644)
	// logger := log.New(f, "", 0)
	// logger := log.New(os.Stderr, "", 0)

	// m := G.NewLispMachine(nn.sub, G.ExecuteFwdOnly(), G.WithLogger(logger), G.WithWatchlist(), G.LogBothDir(), G.WithValueFmt("%+3.3v"))
	m := G.NewLispMachine(nn.sub, G.ExecuteFwdOnly())
	if err := m.RunAll(); err != nil {
		return 0, err
	}
	// logger.Println("========================\n")

	val := nn.scores.Value().(tensor.Tensor)
	t, err := tensor.Argmax(val, tensor.AllAxes)
	if err != nil {
		return 0, err
	}

	return t.ScalarValue().(int), nil
}

// utility function

// feats2vec re-binds every select node to the embedding row named by the
// matching entry of indicators. The slice is read in three consecutive parts:
// word features up to POS_OFFSET (shifted down by wordFeatsStartAt), POSTag
// features up to DEP_OFFSET (used as-is), and dependency features after that
// (shifted down by labelFeatsStartAt). The first failed bind is returned.
func (nn *neuralnetwork2) feats2vec(indicators []int) error {
	rebind := func(selects G.Nodes, feats []int, shift int) error {
		for pos, feat := range feats {
			if err := G.UnsafeLet(selects[pos], G.S(feat-shift)); err != nil {
				return err
			}
		}
		return nil
	}

	// fix word features
	if err := rebind(nn.x_wSelW, indicators[:POS_OFFSET], wordFeatsStartAt); err != nil {
		return err
	}

	// fix tag features
	if err := rebind(nn.x_tSelT, indicators[POS_OFFSET:DEP_OFFSET], 0); err != nil {
		return err
	}

	// fix dependency label features
	return rebind(nn.x_lSelL, indicators[DEP_OFFSET:], labelFeatsStartAt)
}


================================================
FILE: dep/nn2_io.go
================================================
package dep

import (
	"bytes"
	"encoding/gob"
	"fmt"

	"github.com/pkg/errors"
	G "gorgonia.org/gorgonia"
	T "gorgonia.org/tensor"
)

// empty is a zero-size placeholder value. None of the functions in this file
// use it.
var empty struct{}

// String describes the network: its NNConfig, followed by the shapes of the
// embedding matrices, the number of select nodes of each kind, and the shapes
// of the first-layer weights, the bias and the second-layer weights.
func (nn *neuralnetwork2) String() string {
	const layout = `Config
------
%v
Info
------
Embeddings_Word       : %v
Embeddings_POStag     : %v
Embeddings_Dependency : %v
Selects_Words         : %d
Selects_POSTag        : %d
Selects_Dependency    : %d
Weights1_Word         : %v
Weights1_POSTag       : %v
Weights1_Dependency   : %v
Biases                : %v
Weights2              : %v
`

	embWord, embTag, embDep := nn.e_w.Shape(), nn.e_t.Shape(), nn.e_l.Shape()
	selWord, selTag, selDep := len(nn.x_wSelW), len(nn.x_tSelT), len(nn.x_lSelL)
	w1Word, w1Tag, w1Dep := nn.w1_w.Shape(), nn.w1_t.Shape(), nn.w1_l.Shape()
	bias, w2 := nn.b.Shape(), nn.w2.Shape()

	return fmt.Sprintf(layout, nn.NNConfig,
		embWord, embTag, embDep,
		selWord, selTag, selDep,
		w1Word, w1Tag, w1Dep,
		bias, w2)
}

// GobEncode serializes the network as its NNConfig followed by the values of
// the parameter nodes in a fixed order: e_w, e_t, e_l, w1_w, w1_t, w1_l, b,
// w2. GobDecode reads them back in the same order. A network that is not
// fully initialized cannot be encoded.
func (nn *neuralnetwork2) GobEncode() ([]byte, error) {
	if !nn.initialized() {
		return nil, errors.Errorf("Neural network not initialized. Cannot gob")
	}

	var out bytes.Buffer
	enc := gob.NewEncoder(&out)

	if err := enc.Encode(nn.NNConfig); err != nil {
		return nil, err
	}

	params := G.Nodes{nn.e_w, nn.e_t, nn.e_l, nn.w1_w, nn.w1_t, nn.w1_l, nn.b, nn.w2}
	for _, param := range params {
		if err := enc.Encode(param.Value()); err != nil {
			return nil, err
		}
	}

	return out.Bytes(), nil
}

// GobDecode restores a network written by GobEncode.
//
// The corpus is serialized separately, so nn.dict must be set before
// decoding; otherwise an error is returned. The NNConfig is decoded first and
// used to (re)build the network via init. The parameter values are then
// decoded in the order GobEncode wrote them (e_w, e_t, e_l, w1_w, w1_t, w1_l,
// b, w2) and bound to their nodes.
//
// Any decode failure, and any failure to bind a decoded value to its node, is
// returned; a network with only some of its parameters restored is never
// reported as a success.
func (nn *neuralnetwork2) GobDecode(buf []byte) error {
	// prechecks
	if nn.dict == nil {
		return errors.Errorf("Neural Network has no corpus attached to it (Corpuses are serialized separately).")
	}

	decoder := gob.NewDecoder(bytes.NewBuffer(buf))

	if err := decoder.Decode(&nn.NNConfig); err != nil {
		return err
	}

	// init creates the nodes whose shapes size the receiving tensors below
	if err := nn.init(); err != nil {
		return err
	}

	params := G.Nodes{nn.e_w, nn.e_t, nn.e_l, nn.w1_w, nn.w1_t, nn.w1_l, nn.b, nn.w2}
	for _, param := range params {
		val := T.New(T.Of(nn.Dtype), T.WithShape(param.Shape()...))
		if err := decoder.Decode(val); err != nil {
			return errors.Wrapf(err, "decoding value of %v", param.Name())
		}
		if err := G.Let(param, val); err != nil {
			return errors.Wrapf(err, "binding value of %v", param.Name())
		}
	}

	return nil
}


================================================
FILE: dep/nn2_io_test.go
================================================
package dep

import (
	"bytes"
	"encoding/gob"
	"fmt"
	"testing"

	"github.com/chewxy/lingo"
	"github.com/chewxy/lingo/corpus"
	G "gorgonia.org/gorgonia"
)

// TestNNIO round-trips a neuralnetwork2 through gob: it checks the textual
// description of the original and of the decoded copy, compares every
// parameter value, and finally checks that decoding into a network without a
// corpus fails.
func TestNNIO(t *testing.T) {
	sts := allSentences()
	nn := new(neuralnetwork2)
	nn.NNConfig = DefaultNNConfig
	nn.dict = corpus.GenerateCorpus(sts)
	nn.transitions = transitions

	if err := nn.init(); err != nil {
		t.Fatalf("%+v", err)
	}

	s := `Config
------
Batch Size               : 10000
Dropout Rate             : 0.500000
AdaGrad Eps (ε)          : 0.000001
AdaGrad Learn Rate (η)   : 0.010000
Regularization Parameter : 0.000002
Hidden Layer Size        : 200
Embedding Size           : 50
Number Precomputed       : 30000

Evaluate Per 100 Iterations
Clear Gradients Per 0 Iterations
Dtype: float64

Info
------
Embeddings_Word       : (74, 50)
Embeddings_POStag     : (%d, 50)
Embeddings_Dependency : (%d, 50)
Selects_Words         : 18
Selects_POSTag        : 18
Selects_Dependency    : 12
Weights1_Word         : (200, 900)
Weights1_POSTag       : (200, 900)
Weights1_Dependency   : (200, 600)
Biases                : (200)
Weights2              : (%d, 200)
`

	correctDesc := fmt.Sprintf(s, lingo.MAXTAG, lingo.MAXDEPTYPE, MAXTRANSITION)
	if nn.String() != correctDesc {
		t.Errorf("Oops. Got %q. Want %q", nn.String(), correctDesc)
	}
	// nn.Dtype = tensor.Float32

	var buf bytes.Buffer
	encoder := gob.NewEncoder(&buf)
	if err := encoder.Encode(nn); err != nil {
		t.Fatalf("%+v", err)
	}

	decoder := gob.NewDecoder(&buf)
	nn2 := new(neuralnetwork2)
	nn2.dict = corpus.GenerateCorpus(sts)
	nn2.transitions = transitions
	if err := decoder.Decode(nn2); err != nil {
		t.Fatal(err)
	}

	// the decoded copy, not the original, is what needs checking here
	if nn2.String() != correctDesc {
		t.Fatalf("Oops. Got %q. Want %q", nn2.String(), correctDesc)
	}

	if !G.ValueEq(nn.e_w.Value(), nn2.e_w.Value()) {
		t.Errorf("Expected e_w to be the same. Expected %1.1s. Got %1.1s", nn.e_w.Value(), nn2.e_w.Value())
	}

	if !G.ValueEq(nn.e_t.Value(), nn2.e_t.Value()) {
		t.Errorf("Expected e_t to be the same. Expected %1.1s. Got %1.1s", nn.e_t.Value(), nn2.e_t.Value())
	}

	if !G.ValueEq(nn.e_l.Value(), nn2.e_l.Value()) {
		t.Errorf("Expected e_l to be the same. Expected %1.1s. Got %1.1s", nn.e_l.Value(), nn2.e_l.Value())
	}

	if !G.ValueEq(nn.w1_w.Value(), nn2.w1_w.Value()) {
		t.Errorf("Expected w1_w to be the same. Expected %1.1s. Got %1.1s", nn.w1_w.Value(), nn2.w1_w.Value())
	}

	if !G.ValueEq(nn.w1_t.Value(), nn2.w1_t.Value()) {
		t.Errorf("Expected w1_t to be the same. Expected %1.1s. Got %1.1s", nn.w1_t.Value(), nn2.w1_t.Value())
	}

	if !G.ValueEq(nn.w1_l.Value(), nn2.w1_l.Value()) {
		t.Errorf("Expected w1_l to be the same. Expected %1.1s. Got %1.1s", nn.w1_l.Value(), nn2.w1_l.Value())
	}

	if !G.ValueEq(nn.b.Value(), nn2.b.Value()) {
		t.Errorf("Expected b to be the same. Expected %1.1s. Got %1.1s", nn.b.Value(), nn2.b.Value())
	}

	if !G.ValueEq(nn.w2.Value(), nn2.w2.Value()) {
		t.Errorf("Expected w2 to be the same. Expected %1.1s. Got %1.1s", nn.w2.Value(), nn2.w2.Value())
	}

	t.Logf("Visual Inspection: \n%+1.8s\n%+1.8s", nn.e_w.Value(), nn2.e_w.Value())

	// special case: decoding into a network that has no corpus attached must fail
	buf.Reset()
	encoder = gob.NewEncoder(&buf)
	if err := encoder.Encode(nn); err != nil {
		t.Fatalf("%+v", err)
	}
	decoder = gob.NewDecoder(&buf)
	nn3 := new(neuralnetwork2)
	if err := decoder.Decode(nn3); err == nil {
		t.Error("Expected a nocorpus error")
	}
}


================================================
FILE: dep/nn2_test.go
================================================
package dep

import (
	"math/rand"
	"testing"
	"time"

	"github.com/chewxy/lingo/corpus"
	"gorgonia.org/gorgonia"
)

// TestNN2 trains the network on the test sentences for a number of epochs,
// checks that the reported cost went down between the first and the last
// report, and then logs the predictions for a simple sentence.
func TestNN2(t *testing.T) {
	rand.Seed(1337)

	// we test 50 iterations unless the short flag is passed in
	epochs := 50
	if testing.Short() {
		epochs = 10
	}

	sts := allSentences()
	nn := new(neuralnetwork2)
	nn.NNConfig = DefaultNNConfig
	nn.Dtype = gorgonia.Float32
	nn.dict = corpus.GenerateCorpus(sts)
	nn.transitions = transitions

	if err := nn.init(); err != nil {
		t.Fatalf("%+v", err)
	}

	var costs []float64
	ch := nn.costProgress()
	sigChan := make(chan struct{})

	// collect the costs until the channel is closed, then signal completion
	go func(ch <-chan gorgonia.Value, sig chan struct{}) {
		for cost := range ch {
			switch c := cost.Data().(type) {
			case float32:
				costs = append(costs, float64(c))
			case float64:
				costs = append(costs, c)
			}

			t.Logf("Cost %v", cost)
		}
		sig <- struct{}{}
	}(ch, sigChan)

	exs := makeExamples(sts, nn.NNConfig, nn.dict, transitions, dummyFix{})

	start := time.Now()
	for i := 0; i < epochs; i++ {
		if err := nn.train(exs); err != nil {
			t.Errorf("%+v", err)
		}
		shuffleExamples(exs)
	}
	// simulate what *DependencyParser would do
	close(nn.costChan)
	nn.costChan = nil

	t.Logf("Training %d iterations took Taken: %v", epochs, time.Since(start))

	<-sigChan
	// costs is indexed right below, so an empty list has to stop the test
	if len(costs) == 0 {
		t.Fatal("Expected some costs")
	}
	if costs[0] <= costs[len(costs)-1] {
		t.Error("Expected costs to have reduced during training")
	}

	// PREDICTION TIME!

	ss2 := simpleSentence()
	exs = makeExamples(ss2, nn.NNConfig, nn.dict, transitions, dummyFix{})
	start = time.Now()
	for i, ex := range exs {
		ind, err := nn.pred(ex.features)
		if err != nil {
			t.Errorf("Example %d failed: %v", i, err)
			continue
		}

		t.Logf("Example %d. Want: %v. Got %v. Same: %t", i, ex.transition, transitions[ind], ex.transition == transitions[ind])
	}
	t.Logf("Pred Time Taken: %v", time.Since(start))
}


================================================
FILE: dep/nnconfig.go
================================================
package dep

import (
	"bytes"
	"encoding/gob"
	"fmt"

	"github.com/pkg/errors"
	"gorgonia.org/tensor"
)

// NNConfig configures the neural network.
// The values noted on each field are the ones DefaultNNConfig is set to.
type NNConfig struct {
	BatchSize                  int     // 10000; number of examples per solver step
	Dropout                    float64 // 0.5; dropout is only applied when this is > 0
	AdaEps                     float64 // 1e-6; passed to the AdaGrad solver as its eps
	AdaAlpha                   float64 // 0.01; passed to the AdaGrad solver as its learn rate
	Reg                        float64 // 1.5e-6; passed to the AdaGrad solver as its L2 regularization
	HiddenSize                 int     // 200
	EmbeddingSize              int     // 50
	NumPrecomputed             int     // 30000
	EvalPerIteration           int     // 100
	ClearGradientsPerIteration int     // 0

	Dtype tensor.Dtype // tensor.Float64; only Float64 and Float32 can be gob-encoded
}

// String lists every configuration field, one per line, in a fixed layout.
func (c NNConfig) String() string {
	const layout = `Batch Size               : %d
Dropout Rate             : %f
AdaGrad Eps (ε)          : %f
AdaGrad Learn Rate (η)   : %f
Regularization Parameter : %f
Hidden Layer Size        : %d
Embedding Size           : %d
Number Precomputed       : %d

Evaluate Per %d Iterations
Clear Gradients Per %d Iterations
Dtype: %v
`
	return fmt.Sprintf(layout,
		c.BatchSize,
		c.Dropout,
		c.AdaEps,
		c.AdaAlpha,
		c.Reg,
		c.HiddenSize,
		c.EmbeddingSize,
		c.NumPrecomputed,
		c.EvalPerIteration,
		c.ClearGradientsPerIteration,
		c.Dtype,
	)
}

// DefaultNNConfig is the default config that is passed in, for initialization purposes.
var DefaultNNConfig NNConfig

// GobEncode serializes the configuration as a sequence of individually
// gob-encoded values, in this order: BatchSize, Dropout, AdaEps, AdaAlpha,
// Reg, HiddenSize, EmbeddingSize, NumPrecomputed, EvalPerIteration,
// ClearGradientsPerIteration, and finally one byte for the Dtype (0 for
// Float64, 1 for Float32).
//
// An error is returned for any other Dtype, and when any value fails to
// encode.
func (c NNConfig) GobEncode() ([]byte, error) {
	var dtype byte
	switch c.Dtype {
	case tensor.Float64:
		dtype = 0
	case tensor.Float32:
		dtype = 1
	default:
		return nil, errors.Errorf("Unsupported Dtype to be GobEncoded")
	}

	var buf bytes.Buffer
	encoder := gob.NewEncoder(&buf)

	// The order is the wire format; GobDecode reads the values back in the same order.
	fields := []interface{}{
		c.BatchSize,
		c.Dropout,
		c.AdaEps,
		c.AdaAlpha,
		c.Reg,
		c.HiddenSize,
		c.EmbeddingSize,
		c.NumPrecomputed,
		c.EvalPerIteration,
		c.ClearGradientsPerIteration,
		dtype,
	}
	for _, field := range fields {
		if err := encoder.Encode(field); err != nil {
			return nil, err
		}
	}
	return buf.Bytes(), nil
}

// GobDecode restores a configuration written by GobEncode. The values are
// read in the order they were written: BatchSize, Dropout, AdaEps, AdaAlpha,
// Reg, HiddenSize, EmbeddingSize, NumPrecomputed, EvalPerIteration,
// ClearGradientsPerIteration, and finally the Dtype byte (0 for Float64, 1
// for Float32).
//
// An error is returned when any value fails to decode, or when the Dtype byte
// is not one of the two known values. Truncated or mismatched input is
// therefore rejected instead of silently yielding zero values.
func (c *NNConfig) GobDecode(p []byte) error {
	decoder := gob.NewDecoder(bytes.NewBuffer(p))

	var bite byte
	targets := []interface{}{
		&c.BatchSize,
		&c.Dropout,
		&c.AdaEps,
		&c.AdaAlpha,
		&c.Reg,
		&c.HiddenSize,
		&c.EmbeddingSize,
		&c.NumPrecomputed,
		&c.EvalPerIteration,
		&c.ClearGradientsPerIteration,
		&bite,
	}
	for _, target := range targets {
		if err := decoder.Decode(target); err != nil {
			return err
		}
	}

	switch bite {
	case 0:
		c.Dtype = tensor.Float64
	case 1:
		c.Dtype = tensor.Float32
	default:
		return errors.Errorf("Unsupported Dtype to be GobDecoded: %v", bite)
	}
	return nil
}

func init() {
	DefaultNNConfig = NNConfig{
		BatchSize: 10000,
		Dropout:   0.5,

		AdaEps:   1e-6,
		AdaAlpha: 0.01,

		Reg: 1.5e-6,

		HiddenSize:     200,
		EmbeddingSize:  50,
		NumPrecomputed: 30000,

		EvalPerIteration:           100,
		ClearGradientsPerIteration: 0,

		Dtype: tensor.Float64,
		// Dtype: gorgonia.Float32,
	}
}


================================================
FILE: dep/release.go
================================================
// +build !debug

package dep

// Everything below belongs to the build without the "debug" tag (see the
// build constraint at the top of this file). The logging and diagnostic
// helpers are empty stubs here, so calls to them do nothing.
// NOTE(review): presumably dep/debug.go provides the real counterparts under
// the debug tag — confirm.

// BUILD_DEBUG and BUILD_DIAG are banner strings identifying this build.
const BUILD_DEBUG = "PARSER: RELEASE BUILD"
const BUILD_DIAG = "Non-Diagnostic Build"

// DEBUG is false in this build.
const DEBUG = false

// READMEMSTATS is false by default in this build.
var READMEMSTATS = false

// TABCOUNT starts at zero; nothing in this file changes it.
var TABCOUNT uint32 = 0

// enterLoggingContext does nothing in this build.
func enterLoggingContext() {}

// leaveLoggingContext does nothing in this build.
func leaveLoggingContext() {}

// logTrainingProgress does nothing in this build.
func logTrainingProgress(iteration, correct, total, length, possibles int) {}

// logMemStats does nothing in this build.
func logMemStats() {}

// logf does nothing in this build; its arguments are discarded.
func logf(format string, others ...interface{}) {}

// recoverFrom does nothing in this build.
func recoverFrom(format string, attrs ...interface{}) {}

// SprintFeatures always returns the empty string in this build.
func (d *Parser) SprintFeatures(feature []int) string { return "" }

// SprintScores always returns the empty string in this build.
func SprintScores(scores []float64, ts []transition) string { return "" }


================================================
FILE: dep/span.go
================================================
package dep

// span is a half-open index range [start, end).
type span struct {
	start, end int
}

// makeSpan builds the span [start, end). It panics when the range would be
// empty or inverted, i.e. when start is not strictly less than end.
func makeSpan(start, end int) span {
	if start < end {
		return span{start: start, end: end}
	}
	panic("Impossible span created")
}

// combine returns the smallest span covering both s and other: it starts at
// the smaller of the two starts and ends at the larger of the two ends.
func (s span) combine(other span) span {
	return span{
		start: minInt(s.start, other.start),
		end:   maxInt(s.end, other.end),
	}
}


================================================
FILE: dep/test_test.go
================================================
package dep

import (
	"bufio"
	"crypto/md5"
	"encoding/gob"
	"fmt"
	"io"
	"log"
	"os"
	"strings"

	"github.com/chewxy/lingo"
	"github.com/chewxy/lingo/treebank"
	"github.com/kljensen/snowball"
)

// dummyLem is a test stand-in for a lemmatizer that is never available.
type dummyLem struct{}

// Lemmatize always fails with a componentUnavailable("lemmatizer") error.
func (dummyLem) Lemmatize(s string, pt lingo.POSTag) ([]string, error) {
	return nil, componentUnavailable("lemmatizer")
}

// dummyStemmer is a test stemmer backed by the snowball package.
type dummyStemmer struct{}

// Stem returns the result of snowball.Stem for s with language "english".
func (dummyStemmer) Stem(s string) (string, error) {
	return snowball.Stem(s, "english", true)
}

// dummyFix bundles the dummy stemmer and lemmatizer into a single test
// fixture; it additionally reports that no clusters are available.
type dummyFix struct {
	dummyStemmer
	dummyLem
}

// Clusters always fails with a componentUnavailable("clusters") error.
func (dummyFix) Clusters() (map[string]lingo.Cluster, error) {
	return nil, componentUnavailable("clusters")
}

const nnps = `1	Guerrillas	guerrilla	NOUN	NNS	Number=Plur	2	nsubj	_	_
2	threatened	threaten	VERB	VBD	Mood=Ind|Tense=Past|VerbForm=Fin	0	root	_	_
3	to	to	PART	TO	_	4	mark	_	_
4	assassinate	assassinate	VERB	VB	VerbForm=Inf	2	xcomp	_	_
5	Prime	Prime	PROPN	NNP	Number=Sing	6	compound	_	_
6	Minister	Minister	PROPN	NNP	Number=Sing	8	compound	_	_
7	Iyad	Iyad	PROPN	NNP	Number=Sing	8	compound	_	_
8	Allawi	Allawi	PROPN	NNP	Number=Sing	4	dobj	_	_
9	and	and	CONJ	CC	_	8	cc	_	_
10	Minister	Minister	PROPN	NNP	Number=Sing	14	compound	_	_
11	of	of	ADP	IN	_	12	case	_	_
12	Defense	Defense	PROPN	NNP	Number=Sing	10	nmod	_	_
13	Hazem	Hazem	PROPN	NNP	Number=Sing	14	compound	_	_
14	Shaalan	Shaalan	PROPN	NNP	Number=Sing	8	conj	_	_
15	in	in	ADP	IN	_	16	case	_	_
16	retaliation	retaliation	NOUN	NN	Number=Sing	4	nmod	_	_
17	for	for	ADP	IN	_	19	case	_	_
18	the	the	DET	DT	Definite=Def|PronType=Art	19	det	_	_
19	attack	attack	NOUN	NN	Number=Sing	16	nmod	_	_
20	.	.	PUNCT	.	_	2	punct	_	_

`
const simple = `1	Yet	yet	CONJ	CC	_	5	cc	_	_
2	we	we	PRON	PRP	Case=Nom|Number=Plur|Person=1|PronType=Prs	5	nsubj	_	_
3	did	do	AUX	VBD	Mood=Ind|Tense=Past|VerbForm=Fin	5	aux	_	_
4	n't	not	PART	RB	_	5	neg	_	_
5	charge	charge	VERB	VB	VerbForm=Inf	0	root	_	_
6	them	they	PRON	PRP	Case=Acc|Number=Plur|Person=3|PronType=Prs	5	dobj	_	_
7	for	for	ADP	IN	_	9	case	_	_
8	the	the	DET	DT	Definite=Def|PronType=Art	9	det	_	_
9	evacuation	evacuation	NOUN	NN	Number=Sing	5	nmod	_	_
10	.	.	PUNCT	.	_	5	punct	_	_

`

const med = `1	President	President	PROPN	NNP	Number=Sing	2	compound	_	_
2	Bush	Bush	PROPN	NNP	Number=Sing	5	nsubj	_	_
3	on	on	ADP	IN	_	4	case	_	_
4	Tuesday	Tuesday	PROPN	NNP	Number=Sing	5	nmod	_	_
5	nominated	nominate	VERB	VBD	Mood=Ind|Tense=Past|VerbForm=Fin	0	root	_	_
6	two	two	NUM	CD	NumType=Card	7	nummod	_	_
7	individuals	individual	NOUN	NNS	Number=Plur	5	dobj	_	_
8	to	to	PART	TO	_	9	mark	_	_
9	replace	replace	VERB	VB	VerbForm=Inf	5	advcl	_	_
10	retiring	retire	VERB	VBG	VerbForm=Ger	11	amod	_	_
11	jurists	jurist	NOUN	NNS	Number=Plur	9	dobj	_	_
12	on	on	ADP	IN	_	14	case	_	_
13	federal	federal	ADJ	JJ	Degree=Pos	14	amod	_	_
14	courts	court	NOUN	NNS	Number=Plur	11	nmod	_	_
15	in	in	ADP	IN	_	18	case	_	_
16	the	the	DET	DT	Definite=Def|PronType=Art	18	det	_	_
17	Washington	Washington	PROPN	NNP	Number=Sing	18	compound	_	_
18	area	area	NOUN	NN	Number=Sing	14	nmod	_	_
19	.	.	PUNCT	.	_	5	punct	_	_

`

const long = `1	Now	now	ADV	RB	_	5	advmod	_	_
2	,	,	PUNCT	,	_	5	punct	_	_
3	I	I	PRON	PRP	Case=Nom|Number=Sing|Person=1|PronType=Prs	5	nsubj	_	_
4	would	would	AUX	MD	VerbForm=Fin	5	aux	_	_
5	argue	argue	VERB	VB	VerbForm=Inf	0	root	_	_
6	that	that	SCONJ	IN	_	11	mark	_	_
7	one	one	PRON	PRP	_	11	nsubj	_	_
8	could	could	AUX	MD	VerbForm=Fin	11	aux	_	_
9	have	have	AUX	VB	VerbForm=Inf	11	aux	_	_
10	reasonably	reasonably	ADV	RB	_	11	advmod	_	_
11	predicted	predict	VERB	VBN	Tense=Past|VerbForm=Part	5	ccomp	_	_
12	that	that	SCONJ	IN	_	19	mark	_	_
13	some	some	DET	DT	_	14	det	_	_
14	form	form	NOUN	NN	Number=Sing	19	nsubj	_	_
15	of	of	ADP	IN	_	17	case	_	_
16	military	military	ADJ	JJ	Degree=Pos	17	amod	_	_
17	violence	violence	NOUN	NN	Number=Sing	14	nmod	_	_
18	was	be	VERB	VBD	Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin	19	cop	_	_
19	likely	likely	ADJ	JJ	Degree=Pos	11	ccomp	_	_
20	to	to	PART	TO	_	21	mark	_	_
21	occur	occur	VERB	VB	VerbForm=Inf	19	xcomp	_	_
22	in	in	ADP	IN	_	23	case	_	_
23	Lebanon	Lebanon	PROPN	NNP	Number=Sing	21	nmod	_	_
24	-LRB-	-lrb-	PUNCT	-LRB-	_	25	punct	_	_
25	considering	consider	VERB	VBG	VerbForm=Ger	19	advcl	_	_
26	that	that	SCONJ	IN	_	31	mark	_	_
27	the	the	DET	DT	Definite=Def|PronType=Art	28	det	_	_
28	country	country	NOUN	NN	Number=Sing	31	nsubj	_	_
29	has	have	AUX	VBZ	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	31	aux	_	_
30	been	be	AUX	VBN	Tense=Past|VerbForm=Part	31	aux	_	_
31	experiencing	experience	VERB	VBG	Tense=Pres|VerbForm=Part	25	ccomp	_	_
32	some	some	DET	DT	_	33	det	_	_
33	form	form	NOUN	NN	Number=Sing	31	dobj	_	_
34	of	of	ADP	IN	_	35	case	_	_
35	conflict	conflict	NOUN	NN	Number=Sing	33	nmod	_	_
36	for	for	ADP	IN	_	41	case	_	_
37	approximately	approximately	ADV	RB	_	41	advmod	_	_
38	the	the	DET	DT	Definite=Def|PronType=Art	41	det	_	_
39	last	last	ADJ	JJ	Degree=Pos	41	amod	_	_
40	32	32	NUM	CD	NumType=Card	41	nummod	_	_
41	years	year	NOUN	NNS	Number=Plur	31	nmod	_	_
42	-RRB-	-rrb-	PUNCT	-RRB-	_	25	punct	_	_
43	.	.	PUNCT	.	_	5	punct	_	_

`

const cvconllu = `1	Google	Google	PROPN	NNP	Number=Sing	6	nsubj	_	_
2	is	be	VERB	VBZ	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	6	cop	_	_
3	a	a	DET	DT	Definite=Ind|PronType=Art	6	det	_	_
4	nice	nice	ADJ	JJ	Degree=Pos	6	amod	_	_
5	search	search	NOUN	NN	Number=Sing	6	compound	_	_
6	engine	engine	NOUN	NN	Number=Sing	0	root	_	_
7	.	.	PUNCT	.	_	6	punct	_	_

1	Does	do	AUX	VBZ	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	3	aux	_	_
2	anybody	anybody	NOUN	NN	Number=Sing	3	nsubj	_	_
3	use	use	VERB	VB	VerbForm=Inf	0	root	_	_
4	it	it	PRON	PRP	Case=Acc|Gender=Neut|Number=Sing|Person=3|PronType=Prs	3	dobj	_	_
5	for	for	ADP	IN	_	6	case	_	_
6	anything	anything	NOUN	NN	Number=Sing	3	nmod	_	_
7	else	else	ADJ	JJ	Degree=Pos	6	amod	_	_
8	?	?	PUNCT	.	_	3	punct	_	_

`

// lotsaNNP parses the nnps CoNLL-U fixture and returns the dependency built
// from its first sentence, passing dummyFix{} to Dependency.
func lotsaNNP() *lingo.Dependency {
	parsed := treebank.ReadConllu(strings.NewReader(nnps))
	return parsed[0].Dependency(dummyFix{})
}

// simpleSentence parses the `simple` CoNLL-U fixture, which lists 10 tokens.
func simpleSentence() []treebank.SentenceTag {
	return treebank.ReadConllu(strings.NewReader(simple))
}

// mediumSentence parses the `med` CoNLL-U fixture, which lists 19 tokens.
func mediumSentence() []treebank.SentenceTag {
	return treebank.ReadConllu(strings.NewReader(med))
}

// longSentence parses the `long` CoNLL-U fixture, which lists 43 tokens.
func longSentence() []treebank.SentenceTag {
	return treebank.ReadConllu(strings.NewReader(long))
}

// allSentences parses the nnps, simple, med and long fixtures (in that order)
// and returns all of their sentences in a single slice.
func allSentences() []treebank.SentenceTag {
	all := treebank.ReadConllu(strings.NewReader(nnps))
	for _, fixture := range []string{simple, med, long} {
		all = append(all, treebank.ReadConllu(strings.NewReader(fixture))...)
	}
	return all
}

// cvSentences parses the cvconllu fixture, used as the cross validation set in tests.
func cvSentences() []treebank.SentenceTag {
	readr := strings.NewReader(cvconllu)
	return treebank.ReadConllu(readr)
}

// hash returns the MD5 digest of s as a lowercase hexadecimal string.
func hash(s string) string {
	digest := md5.New()
	io.WriteString(digest, s)
	sum := digest.Sum(nil)
	return fmt.Sprintf("%x", sum)
}

// cache gob-encodes s into the file "cached/<md5 of input>.cached".
// Any failure to create the file or to encode terminates the program via log.Fatal.
func cache(input string, s lingo.AnnotatedSentence) {
	target := "cached/" + hash(input) + ".cached"

	out, err := os.Create(target)
	if err != nil {
		log.Fatal(err)
	}
	defer out.Close()

	// Deferred after Close so that it runs first: the buffer is flushed
	// before the file is closed.
	buffered := bufio.NewWriter(out)
	defer buffered.Flush()

	if err = gob.NewEncoder(buffered).Encode(s); err != nil {
		log.Fatal(err)
	}
}

// useCached gob-decodes an AnnotatedSentence from filename, calls Fix on it
// and returns the resulting dependency. Any failure to open or decode the
// file terminates the program via log.Fatal.
func useCached(filename string) *lingo.Dependency {
	in, err := os.Open(filename)
	if err != nil {
		log.Fatal(err)
	}
	defer in.Close()

	var sentence lingo.AnnotatedSentence
	if err = gob.NewDecoder(bufio.NewReader(in)).Decode(&sentence); err != nil {
		log.Fatal(err)
	}

	// fixes ID and what nots
	sentence.Fix()

	return sentence.Dependency()
}


================================================
FILE: dep/train.go
================================================
package dep

import (
	"fmt"
	"os"
	"sync"

	"github.com/chewxy/lingo"
	"github.com/chewxy/lingo/corpus"
	"github.com/chewxy/lingo/treebank"
	"github.com/pkg/errors"
)

// TrainerConsOpt is a construction option for a Trainer. Options are passed to
// NewTrainer, which applies them to the new *Trainer in the order given.
type TrainerConsOpt func(t *Trainer)

// WithTrainingModel returns an option that replaces the Trainer's model with m.
func WithTrainingModel(m *Model) TrainerConsOpt {
	return func(t *Trainer) {
		t.Model = m
	}
}

// WithTrainingSet returns an option that sets the Trainer's training set to st.
func WithTrainingSet(st []treebank.SentenceTag) TrainerConsOpt {
	return func(t *Trainer) {
		t.trainingSet = st
	}
}

// WithCrossValidationSet returns an option that sets the Trainer's cross
// validation set to st. Train cross validates whenever this set is non-empty.
func WithCrossValidationSet(st []treebank.SentenceTag) TrainerConsOpt {
	return func(t *Trainer) {
		t.crossValSet = st
	}
}

// WithConfig returns an option that installs conf as the neural network's
// configuration. It also points the network at the Trainer's current corpus
// and transitions, and copies conf.EvalPerIteration into EvalPerIter.
func WithConfig(conf NNConfig) TrainerConsOpt {
	return func(t *Trainer) {
		network := t.nn
		network.NNConfig = conf
		network.dict = t.corpus
		network.transitions = t.ts

		t.EvalPerIter = conf.EvalPerIteration
	}
}

// WithLemmatizer returns an option that sets the lemmatizer the Trainer
// delegates to. It panics if l is the very Trainer being configured.
func WithLemmatizer(l lingo.Lemmatizer) TrainerConsOpt {
	return func(t *Trainer) {
		// the Trainer must not delegate to itself
		if self, isTrainer := l.(*Trainer); isTrainer && self == t {
			panic("Recursive definition of lemmatizer (trying to set the t.lemmatizer = T) !")
		}

		t.l = l
	}
}

// WithStemmer returns an option that sets the stemmer the Trainer delegates
// to. It panics if s is the very Trainer being configured.
func WithStemmer(s lingo.Stemmer) TrainerConsOpt {
	return func(t *Trainer) {
		// the Trainer must not delegate to itself
		if self, isTrainer := s.(*Trainer); isTrainer && self == t {
			panic("Recursive setting of stemmer! (Trying to set t.stemmer = T)")
		}

		t.s = s
	}
}

// WithCluster returns an option that sets the brown clusters returned by the
// Trainer's Clusters method.
func WithCluster(c map[string]lingo.Cluster) TrainerConsOpt {
	return func(t *Trainer) {
		t.c = c
	}
}

// WithCorpus returns an option that makes c both the Trainer's corpus and the
// neural network's dictionary.
func WithCorpus(c *corpus.Corpus) TrainerConsOpt {
	return func(t *Trainer) {
		t.corpus, t.nn.dict = c, c
	}
}

// WithGeneratedCorpus returns an option that generates a corpus from sts. If
// the Trainer has no corpus the generated one is used as-is; otherwise it is
// merged into the existing corpus. The neural network's dictionary is then
// pointed at the Trainer's corpus.
func WithGeneratedCorpus(sts ...treebank.SentenceTag) TrainerConsOpt {
	return func(t *Trainer) {
		generated := corpus.GenerateCorpus(sts)
		if t.corpus != nil {
			t.corpus.Merge(generated)
		} else {
			t.corpus = generated
		}

		t.nn.dict = t.corpus
	}
}

// Trainer trains a model. It embeds the *Model being trained and holds the
// training data, the training configuration and the optional channels used to
// report progress while training.
type Trainer struct {
	// trainingSet holds the sentences used for training; Train refuses to run when it is empty.
	trainingSet []treebank.SentenceTag
	// crossValSet holds the sentences used for cross validation; when it is non-empty Train cross validates.
	crossValSet []treebank.SentenceTag

	// once guards the neural network initialization performed by Init.
	once sync.Once
	*Model

	// Training configuration
	EvalPerIter int    // for cross validation - evaluate results every n epochs
	PassDirect  bool   // Pass on the costs directly to the cost channel? If false, an average will be used
	SaveBest    string // SaveBest is the filename that will be saved. If it's empty then the best-while-training will not be saved

	// fixer
	// l, s and c back the Lemmatize, Stem and Clusters methods respectively; each may be nil.
	l lingo.Lemmatizer
	s lingo.Stemmer
	c map[string]lingo.Cluster

	// cost and perf are created lazily by Cost() and Perf(), and closed and reset to nil when training returns.
	// NOTE(review): err is not used by any method visible in this file - confirm whether it is still needed.
	err  chan error
	cost chan float64
	perf chan Performance
}

// NewTrainer creates a new Trainer. The Trainer starts out with a fresh Model
// using the KnownWords corpus and the default transitions, and a neural
// network configured with DefaultNNConfig. The given options are then applied
// in order.
func NewTrainer(opts ...TrainerConsOpt) *Trainer {
	// the default neural network
	network := new(neuralnetwork2)
	network.NNConfig = DefaultNNConfig
	network.transitions = transitions
	network.dict = KnownWords

	// the default model
	model := new(Model)
	model.corpus = KnownWords
	model.ts = transitions
	model.nn = network

	t := &Trainer{Model: model}
	for i := range opts {
		opts[i](t)
	}
	return t
}

// Lemmatize implements lingo.Lemmatizer by delegating to the configured
// lemmatizer. It returns a componentUnavailable error when none has been set.
func (t *Trainer) Lemmatize(a string, pt lingo.POSTag) ([]string, error) {
	if t.l != nil {
		return t.l.Lemmatize(a, pt)
	}
	return nil, componentUnavailable("Lemmatizer")
}

// Stem implements lingo.Stemmer by delegating to the configured stemmer.
// It returns a componentUnavailable error when none has been set.
func (t *Trainer) Stem(a string) (string, error) {
	if t.s != nil {
		return t.s.Stem(a)
	}
	return "", componentUnavailable("Stemmer")
}

// Clusters implements lingo.Fixer. It returns the configured brown clusters,
// or a componentUnavailable error when none have been set.
func (t *Trainer) Clusters() (map[string]lingo.Cluster, error) {
	if t.c != nil {
		return t.c, nil
	}
	return nil, componentUnavailable("Clusters")
}

/* Getters */

// Cost returns the channel on which training costs are reported, creating it
// (unbuffered) on first use. When PassDirect is true the costs are forwarded
// as they come; otherwise they are averaged over each epoch.
func (t *Trainer) Cost() <-chan float64 {
	if t.cost != nil {
		return t.cost
	}

	t.cost = make(chan float64)
	return t.cost
}

// Perf returns the channel on which cross validation Performance is reported
// during training, creating it (unbuffered) on first use.
func (t *Trainer) Perf() <-chan Performance {
	if t.perf != nil {
		return t.perf
	}

	t.perf = make(chan Performance)
	return t.perf
}

/* Methods */

// Init initializes the Trainer's neural network. The initialization runs at
// most once per Trainer: only the first call can return an error, and any
// later call returns nil without doing anything.
func (t *Trainer) Init() (err error) {
	t.once.Do(func() {
		err = t.nn.init()
	})
	return err
}

// Train trains a model for the given number of epochs.
//
// It first runs the pre-training sanity check. If a cross validation set has
// been provided the model is trained with cross validation, otherwise it is
// trained without.
func (t *Trainer) Train(epochs int) error {
	err := t.pretrainCheck()
	switch {
	case err != nil:
		return err
	case len(t.crossValSet) > 0:
		return t.crossValidateTrain(epochs)
	default:
		return t.train(epochs)
	}
}

// TrainWithoutCrossValidation trains a model for the given number of epochs
// without cross validation, even if a cross validation set has been provided.
//
// Like Train, it returns an error if the Trainer has not been initialized or
// has no training set.
func (t *Trainer) TrainWithoutCrossValidation(epochs int) error {
	// Run the same sanity check as Train, so that an uninitialized trainer
	// yields an error instead of training on an unprepared network.
	if err := t.pretrainCheck(); err != nil {
		return err
	}
	return t.train(epochs)
}

// train runs the given number of training epochs without any cross validation.
//
// If a cost channel has been requested, a cost handler is started and the
// channel is closed and reset when train returns. The examples are built once
// and shuffled after every epoch.
func (t *Trainer) train(epochs int) error {
	var epochDone chan struct{}
	if t.cost != nil {
		defer func() {
			close(t.cost)
			t.cost = nil
		}()

		// epochDone is only non-nil when costs are averaged per epoch
		if epochDone = t.handleCosts(); epochDone != nil {
			defer close(epochDone)
		}
	}

	examples := makeExamples(t.trainingSet, t.nn.NNConfig, t.nn.dict, t.ts, t)

	for remaining := epochs; remaining > 0; remaining-- {
		err := t.nn.train(examples)
		if err != nil {
			return err
		}

		// tell the cost handler that an epoch has finished
		if epochDone != nil {
			epochDone <- struct{}{}
		}

		shuffleExamples(examples)
	}
	return nil
}

// crossValidateTrain trains the model for the given number of epochs and
// periodically evaluates it against the cross validation set, to keep an eye
// on overfitting.
//
// The evaluation runs every EvalPerIter epochs (when EvalPerIter > 0) and
// always after the last epoch. Each result is sent down the perf channel if
// one was requested. Whenever the UAS improves on the best seen so far and
// SaveBest is set, the model is saved to that file.
//
// The cost and perf channels, if any, are closed and reset when it returns.
// It returns the first training error, or an error if the SaveBest file
// cannot be created.
func (t *Trainer) crossValidateTrain(epochs int) error {
	if t.perf != nil {
		defer func() {
			close(t.perf)
			t.perf = nil
		}()
	}

	var epochChan chan struct{}
	if t.cost != nil {
		defer func() {
			close(t.cost)
			t.cost = nil
		}()

		epochChan = t.handleCosts()
		if epochChan != nil {
			defer close(epochChan)
		}
	}
	examples := makeExamples(t.trainingSet, t.nn.NNConfig, t.nn.dict, t.ts, t)

	var best Performance
	for e := 0; e < epochs; e++ {
		if err := t.nn.train(examples); err != nil {
			return err
		}

		// && binds tighter than ||, so the modulo is only evaluated when EvalPerIter > 0
		if t.EvalPerIter > 0 && e%t.EvalPerIter == 0 || e == epochs-1 {
			perf := t.crossValidate(t.crossValSet)

			// if there is a channel to report back the performance, send it down
			if t.perf != nil {
				perf.Iter = e
				t.perf <- perf
			}

			if perf.UAS > best.UAS {
				best = perf

				if t.SaveBest != "" {
					f, err := os.Create(t.SaveBest)
					if err != nil {
						return errors.Wrapf(err, "Unable to open SaveBest file %q", t.SaveBest)
					}

					t.Model.SaveWriter(f)

					// Release the file handle right away: this branch can run once per
					// evaluation, so a deferred Close would pile up until return.
					// The error is deliberately ignored - closing an already closed
					// *os.File only yields an error, so this stays harmless should
					// SaveWriter close the file itself.
					_ = f.Close()
				}
			}
		}

		if epochChan != nil {
			epochChan <- struct{}{}
		}

		shuffleExamples(examples)
	}
	return nil
}

// pretrainCheck verifies that training can start: the neural network must
// exist and be initialized, and the training set must not be empty.
func (t *Trainer) pretrainCheck() error {
	switch {
	case t.nn == nil || !t.nn.initialized():
		return errors.Errorf("DependencyParser not init()'d. Perhaps you forgot to call .Init() somewhere?")
	case len(t.trainingSet) == 0:
		return errors.Errorf("Cannot train with no training data set")
	}
	return nil
}

// handleCosts handles the costs from the neural network in two ways:
//		1. pass: directly passes on the costs (which may come from multiple batches in an epoch)
//		2. mean: calculates the mean of the costs and passes it on into t.cost
//
// Which one is used depends on t.PassDirect. In pass mode the returned channel
// is nil. In mean mode the returned channel is the epoch signal: the caller
// sends on it once per finished epoch to have the mean of the costs collected
// so far sent down t.cost, and closes it when training is over to stop the
// averaging goroutine.
//
// This method does not check t.cost itself: it should only be called after a
// check that t.cost is not nil.
func (t *Trainer) handleCosts() (epochChan chan struct{}) {
	nncost := t.nn.costProgress()

	if t.PassDirect {
		go func() {
			for cost := range nncost {
				switch c := cost.Data().(type) {
				case float32:
					t.cost <- float64(c)
				case float64:
					t.cost <- c
				default:
					// this should NEVER happen
					panic(fmt.Sprintf("Unhandled cost type %T", c))
				}
			}
		}()
		return nil
	}

	epochChan = make(chan struct{})

	// it collects the costs until the epoch chan signals that an epoch is done. Then the cost is averaged and sent down the t.cost channel
	go func(epochChan chan struct{}) {
		var collected []float64
		for {
			select {
			case cost := <-nncost:
				switch c := cost.Data().(type) {
				case float32:
					collected = append(collected, float64(c))
				case float64:
					collected = append(collected, c)
				default:
					// this should NEVER happen
					panic(fmt.Sprintf("Unhandled cost type %T", c))
				}
			case _, ok := <-epochChan:
				if !ok {
					// The caller closed the epoch channel: training is over.
					// A closed channel is always ready to receive, so without this
					// exit the loop would spin and keep sending on t.cost, which
					// the caller closes and resets right after.
					return
				}

				var avg float64
				for _, cost := range collected {
					avg += cost
				}

				if len(collected) > 0 {
					avg /= float64(len(collected))
				}

				t.cost <- avg
				collected = collected[:0]
			}
		}
	}(epochChan)

	return epochChan
}


================================================
FILE: dep/train_test.go
================================================
package dep

import (
	"testing"

	"github.com/chewxy/lingo/corpus"

	G "gorgonia.org/gorgonia"
)

func TestTrainerInitializations(t *testing.T) {
	var d *Trainer
	c := corpus.New()

	d = NewTrainer(WithCorpus(c))
	if d.corpus != c {
		t.Errorf("Expected Corpus to be set to %p. Got %p instead", c, d.corpus)
	}

	d = NewTrainer(WithConfig(DefaultNNConfig))
	if d.corpus != KnownWords {
		t.Error("Expected corpus to be set to the default KnownWords corpus")
	}
	if d.nn == nil {
		t.Fatal("Expected a neural network")
	}
	if d.nn.dict != KnownWords {
		t.Error("Expected neuralnetwork's dict to be set")
	}

	// d2 = d.Clone()
	// if d2.nn != d.nn {
	// 	t.Error("Expected a neural network!")
	// }

	// // init empty
	// d = New()
	// if err := d.Init(); err != nil {
	// 	t.Errorf("%+v", err)
	// }

	// // init with a corpus
	// d = New(WithCorpus(c))
	// if err := d.Init(); err != nil {
	// 	t.Errorf("%+v", err)
	// }
}

// TestTrainer_train exercises training without a cross validation set in three steps:
//		1. an uninitialized Trainer must refuse to train
//		2. an initialized Trainer with PassDirect set must report costs as they come
//		3. an initialized float32 Trainer with PassDirect unset must report averaged costs
//
// NOTE(review): costs is appended to by the reader goroutines and read by the test
// goroutine without any synchronization - confirm whether this is flagged by -race.
func TestTrainer_train(t *testing.T) {
	sts := allSentences()
	epochs := 10

	var err error

	// no Init() call: Train is expected to fail its pre-training check
	trainer := NewTrainer(WithGeneratedCorpus(sts...), WithTrainingSet(sts))
	if err = trainer.Train(epochs); err == nil {
		t.Error("Expected an error when training an uninitialized Trainer")
	}

	// with init
	t.Logf("Pass On Costs Directly")
	conf := DefaultNNConfig
	conf.BatchSize = 90
	trainer = NewTrainer(WithGeneratedCorpus(sts...), WithConfig(conf), WithTrainingSet(sts))
	if err := trainer.Init(); err != nil {
		t.Errorf("%+v", err)
	}
	trainer.PassDirect = true

	var costs []float64
	cost := trainer.Cost()

	// drain the cost channel until the trainer closes it
	go func() {
		for c := range cost {
			costs = append(costs, c)
			t.Logf("Cost %v", c)
		}
	}()

	if err = trainer.Train(epochs); err != nil {
		t.Errorf("Err: %v", err)
	}

	// without any costs the remaining checks of this step would index an empty slice, so skip them
	if len(costs) == 0 {
		t.Errorf("Zero costs...")
		goto avgcosts
	}

	t.Logf("Costs %d", len(costs))
	if len(costs) < (epochs*2)-5 { // we'll allow some tolerance
		t.Errorf("Expected some costs")
	}
	if costs[0] < costs[len(costs)-1] {
		t.Errorf("Costs should be reducing")
	}

avgcosts:
	// with init, avg costs
	t.Logf("Average Costs")
	costs = costs[:0] // reset
	conf = DefaultNNConfig
	conf.Dtype = G.Float32

	trainer = NewTrainer(WithGeneratedCorpus(sts...), WithConfig(conf), WithTrainingSet(sts))
	if err := trainer.Init(); err != nil {
		t.Errorf("%+v", err)
	}
	trainer.PassDirect = false

	cost = trainer.Cost()

	// drain the cost channel until the trainer closes it
	go func() {
		for c := range cost {
			costs = append(costs, c)
			t.Logf("Cost %v", c)
		}
	}()
	if err = trainer.Train(epochs); err != nil {
		t.Errorf("%v", err)
	}

	if len(costs) == 0 {
		t.Fatal("Zero costs")
	}

	t.Logf("Costs %d", len(costs))
	// NOTE(review): this check cannot fire - the t.Fatal above already stops the test when costs is empty
	if len(costs) == 0 {
		t.Errorf("Expected some costs")
	}

	if costs[0] < costs[len(costs)-1] {
		t.Errorf("Costs should be reducing")
	}
}

// TestTestTrainer_crossValidateTrain exercises training with a cross validation set in three steps:
//		1. an uninitialized Trainer must refuse to train
//		2. an initialized Trainer with PassDirect set must report costs as they come
//		3. an initialized float32 Trainer with PassDirect unset must report averaged costs
//
// In steps 2 and 3 the performance channel is drained and logged as well.
//
// NOTE(review): costs is appended to by the reader goroutines and read by the test
// goroutine without any synchronization - confirm whether this is flagged by -race.
func TestTestTrainer_crossValidateTrain(t *testing.T) {
	sts := allSentences()
	cv := cvSentences()
	epochs := 10

	var trainer *Trainer
	var err error

	// uninit
	t.Logf("Uninitiated")
	trainer = NewTrainer(WithGeneratedCorpus(sts...))
	if err = trainer.Train(epochs); err == nil {
		t.Errorf("Expected an error when training with an uninitialized Trainer")
	}

	// with init
	t.Logf("Pass On Costs Directly")
	conf := DefaultNNConfig
	conf.BatchSize = 90
	trainer = NewTrainer(WithGeneratedCorpus(sts...), WithConfig(conf), WithTrainingSet(sts), WithCrossValidationSet(cv))
	trainer.PassDirect = true
	if err := trainer.Init(); err != nil {
		t.Errorf("%+v", err)
	}

	var costs []float64
	cost := trainer.Cost()
	perf := trainer.Perf()

	// drain the performance channel until the trainer closes it
	go func() {
		for p := range perf {
			t.Logf("Perf \n%v", p)
		}
	}()

	// drain the cost channel until the trainer closes it
	go func() {
		for c := range cost {
			costs = append(costs, c)
			t.Logf("Cost %v", c)
		}
	}()
	if err = trainer.Train(epochs); err != nil {
		t.Error(err)
	}

	// without any costs the remaining checks of this step would index an empty slice, so skip them
	if len(costs) == 0 {
		t.Errorf("Zero costs")
		goto avgCosts
	}

	t.Logf("Costs %d", len(costs))
	if len(costs) < (epochs*2)-5 { // we'll allow some tolerance
		t.Errorf("Expected some costs")
	}
	if costs[0] < costs[len(costs)-1] {
		t.Errorf("Costs should be reducing")
	}

avgCosts:
	// with init, avg costs, and using float32
	t.Logf("Average Costs")
	costs = costs[:0] // reset
	conf = DefaultNNConfig
	conf.Dtype = G.Float32
	trainer = NewTrainer(WithGeneratedCorpus(sts...), WithConfig(conf), WithTrainingSet(sts), WithCrossValidationSet(cv))
	if err := trainer.Init(); err != nil {
		t.Errorf("%+v", err)
	}
	trainer.PassDirect = false

	cost = trainer.Cost()
	perf = trainer.Perf()

	// drain the performance channel until the trainer closes it
	go func() {
		for p := range perf {
			t.Logf("Perf \n%v", p)
		}
	}()

	// drain the cost channel until the trainer closes it
	go func() {
		for c := range cost {
			costs = append(costs, c)
			t.Logf("Cost %v", c)
		}
	}()

	// a training failure must fail the test, exactly as in the pass-direct step above
	if err = trainer.Train(epochs); err != nil {
		t.Error(err)
	}

	if len(costs) == 0 {
		t.Fatal("Zero costs")
	}

	t.Logf("Costs %d", len(costs))

	if costs[0] < costs[len(costs)-1] {
		t.Errorf("Costs should be reducing")
	}
}


================================================
FILE: dep/transition.go
================================================
package dep

import (
	"fmt"

	"github.com/chewxy/lingo"
)

// transition is a tuple of Move and label. Both are embedded, and the type is
// comparable, which lookupTransition relies on to find a transition in a table.
type transition struct {
	Move
	lingo.DependencyType
}

// transitions is the default transition table. It is built by this file's init function.
var transitions []transition
// MAXTRANSITION is the number of entries in the default transition table. It is set by this file's init function.
var MAXTRANSITION int

// buildTransitions builds the transition table for the given labels.
//
// The table is ordered by move (in ALLMOVES order), then by label (in the
// order given). Shift is only ever paired with lingo.NoDepType, and every
// other move is paired with every label except lingo.NoDepType.
func buildTransitions(labels []lingo.DependencyType) []transition {
	table := []transition{}

	for _, move := range ALLMOVES {
		isShift := move == Shift
		for _, label := range labels {
			// skip the pair when exactly one of "is a shift" and "has no label" holds
			if isShift != (label == lingo.NoDepType) {
				continue
			}
			table = append(table, transition{move, label})
		}
	}
	return table
}

// String renders the transition as "(move, label)".
func (t transition) String() string {
	move, label := t.Move, t.DependencyType
	return fmt.Sprintf("(%s, %s)", move, label)
}

// lookupTransition returns the index of the first entry of table equal to t.
// It panics if t is not in the table.
func lookupTransition(t transition, table []transition) int {
	for i := range table {
		if table[i] != t {
			continue
		}
		return i
	}
	panic(fmt.Sprintf("Transition %v not found", t))
}

// init builds the default transition table from every dependency type below
// lingo.MAXDEPTYPE, and records its size in MAXTRANSITION.
func init() {
	labels := make([]lingo.DependencyType, lingo.MAXDEPTYPE)
	for i := range labels {
		labels[i] = lingo.DependencyType(i)
	}

	transitions = buildTransitions(labels)
	MAXTRANSITION = len(transitions)
}


================================================
FILE: dep/util.go
================================================
package dep

// minInt returns the smaller of a and b.
func minInt(a, b int) int {
	if b <= a {
		return b
	}
	return a
}

// maxInt returns the larger of a and b.
func maxInt(a, b int) int {
	if b >= a {
		return b
	}
	return a
}


================================================
FILE: dependency.go
================================================
package lingo

import (
	"bytes"
	"fmt"
)

// Dependency represents the dependency parse of a sentence. While AnnotatedSentence does
// already do a job of representing the dependency parse of a sentence, *Dependency actually contains
// meta information about the dependency parse (specifically, lefts, rights) that makes parsing a dependency a lot faster
//
// The fields are mostly left unexported for a good reason - a dependency parse SHOULD be static after it's been built
type Dependency struct {
	AnnotatedSentence

	// wordCount is the length of the AnnotatedSentence the Dependency was built from.
	// It is the upper bound used by Head, Label and Annotation.
	wordCount int

	// lefts[h] and rights[h] list the children of h that sit before, respectively not before, h.
	// They are allocated by the AllocTree option and filled in by AddChild.
	lefts  [][]int
	rights [][]int

	counter int // for checking if a tree is projective

	// n starts at wordCount-1 when built from an AnnotatedSentence and is incremented by every AddChild.
	// Root uses it as the upper bound of its search.
	n int
}

// depConsOpt is a construction option for a *Dependency. NewDependency applies its options in the order given.
type depConsOpt func(*Dependency)

// FromAnnotatedSentence returns an option that builds the dependency from s:
// the sentence is stored, the word count is set to its length and n is set
// to one less than that.
func FromAnnotatedSentence(s AnnotatedSentence) depConsOpt {
	return func(d *Dependency) {
		d.AnnotatedSentence = s
		d.wordCount = len(s)
		d.n = d.wordCount - 1
	}
}

// AllocTree returns an option that allocates the lefts and rights, with one
// empty child list per word. Typical construction of the *Dependency doesn't
// allocate the trees as they're not necessary for a number of tasks.
//
// The allocation is sized by the word count at the time the option is
// applied, so it has to come after the option that sets the sentence.
func AllocTree() depConsOpt {
	return func(d *Dependency) {
		size := d.wordCount
		lefts, rights := make([][]int, size), make([][]int, size)
		for i := range lefts {
			lefts[i] = []int{}
			rights[i] = []int{}
		}
		d.lefts, d.rights = lefts, rights
	}
}

// NewDependency creates a new *Dependency and applies the given construction
// options to it, in order. The available options are:
//		FromAnnotatedSentence
//		AllocTree
func NewDependency(opts ...depConsOpt) *Dependency {
	d := &Dependency{}
	for i := range opts {
		opts[i](d)
	}
	return d
}

// Sentence returns the underlying AnnotatedSentence.
func (d *Dependency) Sentence() AnnotatedSentence {
	return d.AnnotatedSentence
}

// Lefts returns, for every word, the list of its children on the left.
func (d *Dependency) Lefts() [][]int {
	return d.lefts
}

// Rights returns, for every word, the list of its children on the right.
func (d *Dependency) Rights() [][]int {
	return d.rights
}

// WordCount returns the word count of the dependency.
func (d *Dependency) WordCount() int {
	return d.wordCount
}

// N returns the n counter of the dependency.
func (d *Dependency) N() int {
	return d.n
}

// SetLefts replaces the lefts. Please only use this for testing.
func (d *Dependency) SetLefts(l [][]int) {
	d.lefts = l
}

// SetRights replaces the rights. Please only use this for testing.
func (d *Dependency) SetRights(r [][]int) {
	d.rights = r
}

// Head returns the ID of the head of the i-th word. It returns -1 if i is out
// of range or if the word has no head.
func (d *Dependency) Head(i int) int {
	if i >= 0 && i < d.wordCount {
		if word := d.AnnotatedSentence[i]; word.Head != nil {
			return word.HeadID()
		}
	}
	return -1
}

// Label returns the dependency type of the i-th word, or NoDepType if i is
// out of range.
func (d *Dependency) Label(i int) DependencyType {
	if i >= 0 && i < d.wordCount {
		return d.AnnotatedSentence[i].DependencyType
	}
	return NoDepType
}

// Annotation returns the annotation of the i-th word, or nullAnnotation if i
// is out of range.
func (d *Dependency) Annotation(i int) *Annotation {
	if i >= 0 && i < d.wordCount {
		return d.AnnotatedSentence[i]
	}
	return nullAnnotation
}

// AddArc adds a labelled arc from head to child: child is attached to head,
// then the dependency type of child is set to label.
func (d *Dependency) AddArc(head, child int, label DependencyType) {
	d.AddChild(head, child)
	d.AnnotatedSentence[child].DependencyType = label
}

// AddChild makes head the head of child. The child is recorded in the lefts
// of head when it comes before head and in its rights otherwise, and n is
// incremented.
//
// The lefts and rights must have been allocated (see AllocTree).
func (d *Dependency) AddChild(head, child int) {
	headAnn := d.AnnotatedSentence[head]
	d.AnnotatedSentence[child].SetHead(headAnn)

	if child >= head {
		d.rights[head] = append(d.rights[head], child)
	} else {
		d.lefts[head] = append(d.lefts[head], child)
	}

	d.n++
}

// AddRel sets the dependency type of child to rel.
func (d *Dependency) AddRel(child int, rel DependencyType) {
	word := d.AnnotatedSentence[child]
	word.DependencyType = rel
}

// HasSingleRoot reports whether exactly one word of the sentence has a head
// ID of 0.
func (d *Dependency) HasSingleRoot() bool {
	count := 0
	for i := range d.AnnotatedSentence {
		if d.AnnotatedSentence[i].HeadID() == 0 {
			count++
		}
	}
	return count == 1
}

// IsLegal reports whether the heads of the sentence form a legal structure:
// every head ID must be a valid word index (0 <= h < word count), and
// following the heads from any word must never loop back onto a word already
// visited on that same walk.
func (d *Dependency) IsLegal() bool {
	var heads []int
	for _, a := range d.AnnotatedSentence {
		h := a.HeadID()
		// A head equal to the word count is already out of range: valid indices
		// stop at wordCount-1 (as in Head, Label and Annotation), and the walk
		// below indexes both heads and the sentence with it.
		if h < 0 || h >= d.wordCount {
			return false
		}
		heads = append(heads, -1)
	}

	// heads[k] records the start word of the last walk that went through k
	for i := 1; i < d.wordCount; i++ {
		// walk up from i until the root (index 0) is reached
		for k := i; k > 0; {
			if heads[k] == 0 {
				break
			}
			// k was already seen on this very walk: the heads form a cycle
			if heads[k] == i {
				return false
			}
			heads[k] = i
			k = d.AnnotatedSentence[k].HeadID()
		}
	}

	return true
}

// IsProjective reports whether the tree is projective. It resets the visit
// counter, then runs the projective visit from word 0.
func (d *Dependency) IsProjective() bool {
	d.counter = -1
	projective := d.projectiveVisit(0)
	return projective
}

// projectiveVisit visits the subtree of w in order: first the children of w
// found before it, then w itself, then the children found after it. Every
// visited word increments the counter, and the visit fails as soon as a word
// is not visited at the position given by its own index.
func (d *Dependency) projectiveVisit(w int) bool {
	// children of w on its left (word 0 is never considered as a child)
	for i := 1; i < w; i++ {
		if d.AnnotatedSentence[i].HeadID() != w {
			continue
		}
		if !d.projectiveVisit(i) {
			return false
		}
	}

	// w itself
	d.counter++
	if d.counter != w {
		return false
	}

	// children of w on its right
	for i := w + 1; i < d.wordCount; i++ {
		if d.AnnotatedSentence[i].HeadID() != w {
			continue
		}
		if !d.projectiveVisit(i) {
			return false
		}
	}

	return true
}

// Root returns the index of the first word, searching from 1 to n, whose head
// is 0. It returns 0 if there is none.
func (d *Dependency) Root() int {
	for candidate := 1; candidate <= d.n; candidate++ {
		if d.Head(candidate) != 0 {
			continue
		}
		return candidate
	}
	return 0
}

// SprintRel returns the edges of the dependency, one per line, formatted as
//		rel("governor"-governorID, "dependent"-dependentID)
func (d *Dependency) SprintRel() string {
	out := new(bytes.Buffer)
	for _, edge := range d.Edges() {
		gov, dep := edge.Gov, edge.Dep
		fmt.Fprintf(out, "%v(%q-%d, %q-%d)\n", edge.Rel, gov.Value, gov.ID, dep.Value, dep.ID)
	}
	return out.String()
}

// DependencyEdge is a single edge of a dependency parse: the relation Rel
// holding between a governor (Gov) and its dependent (Dep).
type DependencyEdge struct {
	Gov *Annotation
	Dep *Annotation
	Rel DependencyType
}

// Sort interface

// edgeByID orders edges by the ID of their dependent, in ascending order.
type edgeByID []DependencyEdge

// Len returns the number of edges.
func (b edgeByID) Len() int {
	return len(b)
}

// Swap exchanges the edges at i and j.
func (b edgeByID) Swap(i, j int) {
	b[i], b[j] = b[j], b[i]
}

// Less reports whether the dependent of edge i has a smaller ID than the dependent of edge j.
func (b edgeByID) Less(i, j int) bool {
	left, right := b[i].Dep, b[j].Dep
	return left.ID < right.ID
}


================================================
FILE: dependencyTree.go
================================================
package lingo

import (
	"github.com/awalterschulze/gographviz"

	"fmt"

	"sync"
)

// A DependencyTree is an alternate form of representing a dependency parse.
// This form makes it easier to traverse the tree
//
// Each value is a node of the tree: it points at its parent and at its children.
type DependencyTree struct {
	// Parent is the parent node. Dot treats a node with a nil Parent as having no incoming edge.
	Parent *DependencyTree

	ID   int            // the word number in a sentence
	Type DependencyType // refers to the dependency type to the parent
	Word *Annotation

	// Children holds the child nodes, in the order they were added with AddChild.
	Children []*DependencyTree
}

// NewDependencyTree creates a node for the word ann, numbered ID, under the
// given parent. The node starts with an empty, non-nil list of children. It
// is not added to the children of parent.
func NewDependencyTree(parent *DependencyTree, ID int, ann *Annotation) *DependencyTree {
	node := new(DependencyTree)
	node.Parent = parent
	node.ID = ID
	node.Word = ann
	node.Children = []*DependencyTree{}
	return node
}

// AddChild appends child to the children of d. It does not touch the Parent
// of child.
func (d *DependencyTree) AddChild(child *DependencyTree) {
	children := append(d.Children, child)
	d.Children = children
}

// AddRel sets the dependency type linking d to its parent.
func (d *DependencyTree) AddRel(rel DependencyType) {
	d.Type = rel
}

// walk sends d and every node below it down c, starting one goroutine per
// child. The caller must have added 1 to wg for d; walk marks it done when
// d has been sent.
func (d *DependencyTree) walk(c chan *DependencyTree, wg *sync.WaitGroup) {
	defer wg.Done()

	// register all the children before any of them is started
	wg.Add(len(d.Children))
	for i := range d.Children {
		go d.Children[i].walk(c, wg)
	}

	c <- d
}

// Dot returns the graphviz (dot) representation of the tree rooted at d.
//
// The nodes are walked concurrently and streamed to a goroutine that builds
// the graph; the result is read back once every node has been sent.
func (d *DependencyTree) Dot() string {
	nodes := make(chan *DependencyTree)
	result := make(chan string)
	go dotString(nodes, result)

	// walk graph
	var pending sync.WaitGroup
	pending.Add(1)
	go d.walk(nodes, &pending)
	pending.Wait()

	// no more nodes: let the builder finish and hand over the string
	close(nodes)
	return <-result
}

// dotString builds a directed graph named "G" out of the nodes received on c,
// and sends its string representation on out once c is closed.
//
// Every node is labelled with its ID, word and POS tag. Every node that has a
// parent also gets an edge from that parent, labelled with its dependency type.
func dotString(c chan *DependencyTree, out chan string) {
	graph := gographviz.NewEscape()
	graph.SetName("G")
	graph.SetDir(true) // it's always going to be a directed graph

	for node := range c {
		nodeID := fmt.Sprintf("Node_%p", node)
		label := fmt.Sprintf("%d: \"%s/%s\"", node.ID, node.Word.Value, node.Word.POSTag)
		graph.AddNode("G", nodeID, map[string]string{"label": label})

		if node.Parent != nil {
			parentID := fmt.Sprintf("Node_%p", node.Parent)
			edgeLabel := fmt.Sprintf("%v", node.Type)
			graph.AddEdge(parentID, nodeID, true, map[string]string{"label": edgeLabel})
		}
	}

	out <- graph.String()
}

// Walk traverses the tree rooted at d depth first, calling fn on every node
// after all of its children have been walked. Each node is passed to fn as a
// *DependencyTree. A nil fn is never called.
func (d *DependencyTree) Walk(fn func(interface{})) {
	for i := range d.Children {
		d.Children[i].Walk(fn)
	}

	if fn == nil {
		return
	}
	fn(d)
}


================================================
FILE: dependencyType.go
================================================
package lingo

import (
	"fmt"
	"strings"
)

// DependencyType represents the relation between two words.
// The possible values are the constants of the relation set selected at build time.
type DependencyType byte

// dependencyTypeLookup maps the name of every dependency type, both as
// returned by String and in lower case, to its value.
var dependencyTypeLookup map[string]DependencyType

// init fills dependencyTypeLookup with every dependency type from NoDepType
// up to, but not including, MAXDEPTYPE.
func init() {
	lookup := make(map[string]DependencyType)
	for dt := NoDepType; dt < MAXDEPTYPE; dt++ {
		name := dt.String()
		lookup[name] = dt
		lookup[strings.ToLower(name)] = dt
	}
	dependencyTypeLookup = lookup
}

// MarshalText renders the dependency type with its default formatting. It
// never returns an error.
func (dt DependencyType) MarshalText() ([]byte, error) {
	text := fmt.Sprint(dt)
	return []byte(text), nil
}

// UnmarshalText sets dt to the dependency type named by text, after removing
// any surrounding double quotes. A name that is not in the lookup table
// yields the zero DependencyType. It never returns an error.
func (dt *DependencyType) UnmarshalText(text []byte) error {
	name := strings.Trim(string(text), `"`) // for JSON use, if any
	*dt = dependencyTypeLookup[name]
	return nil
}

// list of dependency type functions

// InDepTypes reports whether x is one of the dependency types in set.
func InDepTypes(x DependencyType, set []DependencyType) bool {
	for i := range set {
		if set[i] != x {
			continue
		}
		return true
	}
	return false
}

// IsModifier reports whether x is one of the Modifiers.
func IsModifier(x DependencyType) bool {
	return InDepTypes(x, Modifiers)
}

// IsCompound reports whether x is one of the Compounds.
func IsCompound(x DependencyType) bool {
	return InDepTypes(x, Compounds)
}

// IsDeterminerRel reports whether x is one of the DeterminerRels.
func IsDeterminerRel(x DependencyType) bool {
	return InDepTypes(x, DeterminerRels)
}

// IsMultiword reports whether x is one of the MultiWord relations.
func IsMultiword(x DependencyType) bool {
	return InDepTypes(x, MultiWord)
}

// IsQuantifier reports whether x is one of the QuantifingMods.
func IsQuantifier(x DependencyType) bool {
	return InDepTypes(x, QuantifingMods)
}


================================================
FILE: dependencyType_stanford.go
================================================
// +build stanfordrel

package lingo

// BUILD_RELSET names the dependency relation set compiled into this build.
// This is the value used when building with the stanfordrel tag.
const BUILD_RELSET = "stanfordrel"

//go:generate stringer -type=DependencyType -output=dependencyType_stanford_string.go

// The Stanford dependency relations, as described in
// http://nlp.stanford.edu/software/dependencies_manual.pdf
//
// MAXDEPTYPE is not a relation: it comes last and marks the number of values declared here.
const (
	NoDepType DependencyType = iota
	Dep
	Root
	Aux           // Auxiliary
	AuxPass       // passive auxiliary
	Cop           // Copula
	Arg           // argument
	Agent         // agent
	Comp          // Complement
	AComp         // adjectival complement
	CComp         // clausal complement with internal subject
	XComp         // clausal complement with external subject
	Obj           // Object
	DObj          // Direct Object
	IObj          // Indirect Object
	PObj          // Object of preposition
	Subj          // subject
	NSubj         // Nominal Subject
	NSubjPass     // passive nominal subject
	CSubj         // clausal subject
	CSubjPass     // passive clausal subject
	Coordination  // coordination (cannot use CC, as it's a POSTag)
	Conj          // conjunction
	Expl          // Expletive
	Mod           // modifier
	AMod          // adjectival modifier
	Appos         // Appositional modifier
	Advcl         // adverbial clause modifier
	Det           // determiner
	Predet        // predeterminer
	Preconj       // Preconjunction
	Vmod          // reduced, nonfinite verbal modifier
	MWE           // multiword expression modifier
	Mark          // marker (word introducing an Advcl or CComp)
	AdvMod        // adverbial modifier
	Neg           // negation modifier
	RCMod         // relative clause modifier
	QuantMod      // quantifier modifier
	NounMod       // Noun Compound Modifier (cannot use NN because NN is defined as a POSTag)
	NPAdvMod      // Noun phrase adverbial modifier
	TMod          // temporal modifier
	Num           // Numeric Modifier
	NumberElement // element of compound number (cannot use Number because Number is defined as a LexemeType)
	Prep          // prepositional modifier
	Poss          // possession modifier
	Possessive    // possessive modifier ('s)
	PRT           // phrasal verb particle
	Parataxis     // Parataxis (words that are next to each other)
	GoesWith      // GoesWith
	Punct         // punctuation
	Ref           // referent
	SDep          // Semantic Dependent
	XSubj         // controlling subject

	// additional stuff not found in the original, but found in EWT
	Case
	Compound
	NMod
	Discourse
	NumMod
	RelCl
	NFinCl
	NMod_Poss
	NMod_NPMod
	Vocative
	List
	MWPrep // multiword prepositional modifier
	Remnant
	Acl
	NPMod
	MDVod
	DetMod

	// found in stanford nnparser SD models
	PComp

	MAXDEPTYPE
)

// Modifiers is the set of relations tested by IsModifier.
var Modifiers = []DependencyType{AMod}
// Compounds is the set of relations tested by IsCompound.
var Compounds = []DependencyType{Compound}
// DeterminerRels is the set of relations tested by IsDeterminerRel.
var DeterminerRels = []DependencyType{Det, DetMod}
// MultiWord is the set of relations tested by IsMultiword.
var MultiWord = []DependencyType{MWE, MWPrep, Compound, Parataxis}
// QuantifingMods is the set of relations tested by IsQuantifier.
var QuantifingMods = []DependencyType{QuantMod, NumMod}


================================================
FILE: dependencyType_stanford_string.go
================================================
// +build stanfordrel

// Code generated by "stringer -type=DependencyType -output=dependencyType_stanford_string.go"; DO NOT EDIT

package lingo

import "fmt"

const _DependencyType_name = "NoDepTypeDepRootNSubjNSubjPassDObjIObjCSubjCSubjPassCCompXCompNumModApposNModAClACl_RelClDetDet_PreDetAModNegCaseNMod_NPModNMod_TModNMod_PossAdvClAdvModCompoundCompound_PartMWEListParataxisDiscourseExplAuxAuxPassCopMarkPunctConjCoordinationCC_PreConjMAXDEPTYPE"

var _DependencyType_index = [...]uint16{0, 9, 12, 16, 21, 30, 34, 38, 43, 52, 57, 62, 68, 73, 77, 80, 89, 92, 102, 106, 109, 113, 123, 132, 141, 146, 152, 160, 173, 176, 180, 189, 198, 202, 205, 212, 215, 219, 224, 228, 240, 250, 260}

// String returns the name of i, sliced out of _DependencyType_name using the
// offsets in _DependencyType_index. Values past the end of the index table are
// rendered as "DependencyType(N)".
//
// NOTE(review): the name table in this file contains no entry for relations
// such as PObj or Subj that the stanfordrel constant list declares, so it looks
// stale — confirm and re-run `go generate` with the stanfordrel build tag.
func (i DependencyType) String() string {
	if i >= DependencyType(len(_DependencyType_index)-1) {
		return fmt.Sprintf("DependencyType(%d)", i)
	}
	return _DependencyType_name[_DependencyType_index[i]:_DependencyType_index[i+1]]
}


================================================
FILE: dependencyType_universal.go
================================================
// +build !stanfordrel

package lingo

const BUILD_RELSET = "universalrel"

//go:generate stringer -type=DependencyType -output=dependencyType_universal_string.go

// http://universaldependencies.github.io/docs/en/dep/all.html
// The constants below enumerate the dependency relations of the universal
// relation set. Their order defines their numeric values (iota) and must match
// the generated name table in dependencyType_universal_string.go.
const (
	// NoDepType is the zero value of DependencyType.
	NoDepType DependencyType = iota
	Dep
	Root

	// Core dependents of clausal predicates

	// Nominal dependencies
	NSubj
	NSubjPass
	DObj
	IObj

	// Predicate dependencies
	CSubj
	CSubjPass
	CComp

	XComp

	// Noun dependents

	// Nominal dependencies
	NumMod
	Appos
	NMod

	// Predicate dependencies
	ACl
	ACl_RelCl // RCMod in stanford deps
	Det
	Det_PreDet

	// Modifier word
	AMod
	Neg

	// Case marking, preposition, possessive
	Case

	// Non-core dependents of clausal predicates

	// Nominal dependencies
	NMod_NPMod
	NMod_TMod
	NMod_Poss

	// Predicate dependencies
	AdvCl

	// Modifier word
	AdvMod

	// Compounding and unanalyzed
	Compound
	Compound_Part
	Name // Unused in English
	MWE
	Foreign  // Unused in English
	GoesWith // Unused in English

	// Loose joining relations
	List
	Dislocated // Unused in English
	Parataxis
	Remnant    // Unused in English
	Reparandum // Unused in English

	// Special clausal dependents

	// Nominal dependent
	Vocative // Unused in English
	Discourse
	Expl

	// Auxiliary
	Aux
	AuxPass
	Cop

	// Other
	Mark
	Punct

	// Coordination

	Conj
	Coordination // CC
	CC_PreConj

	// MAXDEPTYPE is the last value of the enumeration; keep it last.
	MAXDEPTYPE
)

// Modifiers groups the modifier relations of this relation set; it currently holds only AMod.
var Modifiers = []DependencyType{AMod}
// Compounds groups the compound relations (Compound and Compound_Part).
var Compounds = []DependencyType{Compound, Compound_Part}
// DeterminerRels groups the determiner relations (Det and Det_PreDet).
var DeterminerRels = []DependencyType{Det, Det_PreDet}
// MultiWord groups the relations that join several words into one unit:
// MWE, Compound, Compound_Part and Parataxis.
var MultiWord = []DependencyType{MWE, Compound, Compound_Part, Parataxis}
// QuantifingMods groups the quantifying modifier relations; it currently holds only NumMod.
var QuantifingMods = []DependencyType{NumMod}


================================================
FILE: dependencyType_universal_string.go
================================================
// +build !stanfordrel

// Code generated by "stringer -type=DependencyType -output=dependencyType_universal_string.go"; DO NOT EDIT

package lingo

import "fmt"

const _DependencyType_name = "NoDepTypeDepRootNSubjNSubjPassDObjIObjCSubjCSubjPassCCompXCompNumModApposNModAClACl_RelClDetDet_PreDetAModNegCaseNMod_NPModNMod_TModNMod_PossAdvClAdvModCompoundCompound_PartNameMWEForeignGoesWithListDislocatedParataxisRemnantReparandumVocativeDiscourseExplAuxAuxPassCopMarkPunctConjCoordinationCC_PreConjMAXDEPTYPE"

var _DependencyType_index = [...]uint16{0, 9, 12, 16, 21, 30, 34, 38, 43, 52, 57, 62, 68, 73, 77, 80, 89, 92, 102, 106, 109, 113, 123, 132, 141, 146, 152, 160, 173, 177, 180, 187, 195, 199, 209, 218, 225, 235, 243, 252, 256, 259, 266, 269, 273, 278, 282, 294, 304, 314}

// String returns the name of i, sliced out of _DependencyType_name using the
// offsets in _DependencyType_index. Values past the end of the index table are
// rendered as "DependencyType(N)".
func (i DependencyType) String() string {
	if i >= DependencyType(len(_DependencyType_index)-1) {
		return fmt.Sprintf("DependencyType(%d)", i)
	}
	return _DependencyType_name[_DependencyType_index[i]:_DependencyType_index[i+1]]
}


================================================
FILE: errors.go
================================================
package lingo

// componentUnavailable is an error that can additionally name a component via
// Component().
// NOTE(review): presumably returned when an optional pipeline component is
// missing — confirm against the implementations.
type componentUnavailable interface {
	error
	// Component returns the name of the component the error refers to.
	Component() string
}


================================================
FILE: go.mod
================================================
module github.com/chewxy/lingo

require (
	github.com/abiosoft/ishell v2.0.0+incompatible
	github.com/abiosoft/readline v0.0.0-20180607040430-155bce2042db // indirect
	github.com/awalterschulze/gographviz v0.0.0-20190221210632-1e9ccb565bca
	github.com/chewxy/hm v1.0.0 // indirect
	github.com/chewxy/math32 v1.0.0 // indirect
	github.com/chzyer/logex v1.1.10 // indirect
	github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1 // indirect
	github.com/davecgh/go-spew v1.1.1 // indirect
	github.com/fatih/color v1.7.0 // indirect
	github.com/flynn-archive/go-shlex v0.0.0-20150515145356-3f9db97f8568 // indirect
	github.com/gogo/protobuf v1.2.1 // indirect
	github.com/golang/protobuf v1.2.0 // indirect
	github.com/google/flatbuffers v1.10.0 // indirect
	github.com/kljensen/snowball v0.6.0
	github.com/leesper/go_rng v0.0.0-20171009123644-5344a9259b21 // indirect
	github.com/mattn/go-colorable v0.1.1 // indirect
	github.com/mattn/go-isatty v0.0.6 // indirect
	github.com/pkg/browser v0.0.0-20180916011732-0a3d74bf9ce4
	github.com/pkg/errors v0.8.1
	github.com/stretchr/testify v1.3.0
	github.com/xtgo/set v1.0.0
	golang.org/x/exp v0.0.0-20190221220918-438050ddec5e // indirect
	golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4 // indirect
	golang.org/x/sys v0.0.0-20190225065934-cc5685c2db12 // indirect
	golang.org/x/text v0.3.0
	gonum.org/v1/gonum v0.0.0-20190221132855-8ea67971a689 // indirect
	gonum.org/v1/netlib v0.0.0-20190221094214-0632e2ebbd2d // indirect
	gorgonia.org/cu v0.9.0-beta // indirect
	gorgonia.org/dawson v1.1.0 // indirect
	gorgonia.org/gorgonia v0.9.1
	gorgonia.org/tensor v0.9.0-beta
	gorgonia.org/vecf32 v0.7.0 // indirect
	gorgonia.org/vecf64 v0.7.0 // indirect
)

go 1.13


================================================
FILE: go.sum
================================================
github.com/abiosoft/ishell v2.0.0+incompatible/go.mod h1:HQR9AqF2R3P4XXpMpI0NAzgHf/aS6+zVXRj14cVk9qg=
github.com/abiosoft/readline v0.0.0-20180607040430-155bce2042db/go.mod h1:rB3B4rKii8V21ydCbIzH5hZiCQE7f5E9SzUb/ZZx530=
github.com/awalterschulze/gographviz v0.0.0-20190221210632-1e9ccb565bca h1:xwIXr1FpA2XBoohlpvgb11No/zbsh5Clm/98PWPcHVA=
github.com/awalterschulze/gographviz v0.0.0-20190221210632-1e9ccb565bca/go.mod h1:GEV5wmg4YquNw7v1kkyoX9etIk8yVmXj+AkDHuuETHs=
github.com/chewxy/hm v1.0.0 h1:zy/TSv3LV2nD3dwUEQL2VhXeoXbb9QkpmdRAVUFiA6k=
github.com/chewxy/hm v1.0.0/go.mod h1:qg9YI4q6Fkj/whwHR1D+bOGeF7SniIP40VweVepLjg0=
github.com/chewxy/math32 v1.0.0 h1:RTt2SACA7BTzvbsAKVQJLZpV6zY2MZw4bW9L2HEKkHg=
github.com/chewxy/math32 v1.0.0/go.mod h1:Miac6hA1ohdDUTagnvJy/q+aNnEk16qWUdb8ZVhvCN0=
github.com/chzyer/logex v1.1.10 h1:Swpa1K6QvQznwJRcfTfQJmTE72DqScAa40E+fbHEXEE=
github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI=
github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1 h1:q763qf9huN11kDQavWsoZXJNW3xEE4JJyHa5Q25/sd8=
github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4=
github.com/flynn-archive/go-shlex v0.0.0-20150515145356-3f9db97f8568/go.mod h1:rZfgFAXFS/z/lEd6LJmf9HVZ1LkgYiHx5pHhV5DR16M=
github.com/gogo/protobuf v1.2.1 h1:/s5zKNz0uPFCZ5hddgPdo2TK2TVrUNMn0OOX8/aZMTE=
github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4=
github.com/golang/protobuf v1.2.0 h1:P3YflyNX/ehuJFLhxviNdFxQPkGK5cDcApsge1SqnvM=
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/google/flatbuffers v1.10.0 h1:wHCM5N1xsJ3VwePcIpVqnmjAqRXlR44gv4hpGi+/LIw=
github.com/google/flatbuffers v1.10.0/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8=
github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/kljensen/snowball v0.6.0/go.mod h1:27N7E8fVU5H68RlUmnWwZCfxgt4POBJfENGMvNRhldw=
github.com/leesper/go_rng v0.0.0-20171009123644-5344a9259b21 h1:O75p5GUdUfhJqNCMM1ntthjtJCOHVa1lzMSfh5Qsa0Y=
github.com/leesper/go_rng v0.0.0-20171009123644-5344a9259b21/go.mod h1:N0SVk0uhy+E1PZ3C9ctsPRlvOPAFPkCNlcPBDkt0N3U=
github.com/mattn/go-colorable v0.1.1/go.mod h1:FuOcm+DKB9mbwrcAfNl7/TZVBZ6rcnceauSikq3lYCQ=
github.com/mattn/go-isatty v0.0.5/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s=
github.com/mattn/go-isatty v0.0.6/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s=
github.com/pkg/browser v0.0.0-20180916011732-0a3d74bf9ce4/go.mod h1:4OwLy04Bl9Ef3GJJCoec+30X3LQs/0/m4HFRt/2LUSA=
github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I=
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/xtgo/set v1.0.0 h1:6BCNBRv3ORNDQ7fyoJXRv+tstJz3m1JVFQErfeZz2pY=
github.com/xtgo/set v1.0.0/go.mod h1:d3NHzGzSa0NmB2NhFyECA+QdRp29oEn2xbT+TpeFoM8=
golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190221220918-438050ddec5e h1:dVreTP5bOOWt5GFwwvgTE2iU0TkIqi2x3r0b8qGlp6k=
golang.org/x/exp v0.0.0-20190221220918-438050ddec5e/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4 h1:YUO/7uOKsKeq9UokNS62b8FYywz3ker1l1vDZRCRefw=
golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190225065934-cc5685c2db12 h1:Zw7eRv6INHGfu15LVRN1vrrwusJbnfJjAZn3D1VkQIE=
golang.org/x/sys v0.0.0-20190225065934-cc5685c2db12/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190206041539-40960b6deb8e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
gonum.org/v1/gonum v0.0.0-20190221132855-8ea67971a689 h1:C+7Si2b5qgXShERPqwtDu36i1o1yf1VM93A3GZIe9Fk=
gonum.org/v1/gonum v0.0.0-20190221132855-8ea67971a689/go.mod h1:jevfED4GnIEnJrWW55YmY9DMhajHcnkqVnEXmEtMyNI=
gonum.org/v1/netlib v0.0.0-20190221094214-0632e2ebbd2d h1:m4zHh49Vwhwq5n7qC7NRl5SqRfTyT/6PP2ASGNGRB1E=
gonum.org/v1/netlib v0.0.0-20190221094214-0632e2ebbd2d/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw=
gorgonia.org/cu v0.9.0-beta h1:s4WQ6fiAGoErwIiXWHRB6Y9ydkx1vTTPwhWzoEZVePc=
gorgonia.org/cu v0.9.0-beta/go.mod h1:RPEPIfaxxqUmeRe7T1T8a0NER+KxBI2McoLEXhP1Vd8=
gorgonia.org/dawson v1.1.0 h1:o7+eJ3SKi9sheH19lpOat//tDbg0Y+M9iY/lH79VHqY=
gorgonia.org/dawson v1.1.0/go.mod h1:Px1mcziba8YUBIDsbzGwbKJ11uIblv/zkln4jNrZ9Ws=
gorgonia.org/gorgonia v0.9.1 h1:6blWHSDHCplQHem+pvo9dZvtsQp7l3ZiVqXk26frn9M=
gorgonia.org/gorgonia v0.9.1/go.mod h1:qucT7YHm/2OuSHWEw/6Je/LQ5htRJNQJ1+qpB58fY8c=
gorgonia.org/tensor v0.9.0-beta h1:16QQufB1vbJxVbIOaB5TwkerdlBWtw+AAnZHUZ531ZE=
gorgonia.org/tensor v0.9.0-beta/go.mod h1:05Y4laKuVlj4qFoZIZW1q/9n1jZkgDBOLmKXZdBLG1w=
gorgonia.org/vecf32 v0.7.0 h1:mkpVzSyT7/Cput5/ZxaMzzp2xbmOtqOyJlTf7AdSMe0=
gorgonia.org/vecf32 v0.7.0/go.mod h1:iHG+kvTMqGYA0SgahfO2k62WRnxmHsqAREGbayRDzy8=
gorgonia.org/vecf64 v0.7.0 h1:ZphOGJfnWlFfY7x8WAJAfO64IAtYqPPq9TEGem+ItZE=
gorgonia.org/vecf64 v0.7.0/go.mod h1:1y4pmcSd+wh3phG+InwWQjYrqwyrtN9h27WLFVQfV1Q=


================================================
FILE: interfaces.go
================================================
package lingo

import (
	"encoding/gob"

	"gorgonia.org/tensor"
)

// Lemmatizer is anything that can lemmatize
type Lemmatizer interface {
	// Lemmatize takes a string (presumably the word) together with its POSTag
	// and returns zero or more strings (presumably the candidate lemmas), or an error.
	Lemmatize(string, POSTag) ([]string, error)
}

// Stemmer is anything that can stem
type Stemmer interface {
	// Stem takes a string (presumably the word) and returns a string
	// (presumably its stem), or an error.
	Stem(string) (string, error)
}

// Sentencer is anything that returns an AnnotatedSentence
type Sentencer interface {
	// Sentence returns the AnnotatedSentence held or produced by the implementation.
	Sentence() AnnotatedSentence
}

// Corpus is the interface for the corpus.
type Corpus interface {
	// Id returns the ID of a word and whether or not it was found in the corpus
	Id(word string) (id int, ok bool)

	// Word returns the word given the ID, and whether or not it was found in the corpus
	Word(id int) (word string, ok bool)

	// Add adds a word to the corpus and returns its ID. If a word was previously in the corpus, it merely updates the frequency count and returns the ID
	Add(word string) int

	// Size returns the size of the corpus.
	Size() int

	// WordFreq returns the frequency of the word. If the word wasn't in the corpus, it returns 0.
	WordFreq(word string) int

	// IDFreq returns the frequency of a word given an ID. If the word isn't in the corpus it returns 0.
	IDFreq(id int) int

	// TotalFreq returns the total number of words ever seen by the corpus. This number includes the count of repeat words.
	TotalFreq() int

	// MaxWordLength returns the length of the longest known word in the corpus
	MaxWordLength() int

	// WordProb returns the probability of a word appearing in the corpus.
	// NOTE(review): the bool presumably reports whether the word is known — confirm against the implementations.
	WordProb(word string) (float64, bool)

	// Serialization: a Corpus must be encodable and decodable with encoding/gob.
	gob.GobEncoder
	gob.GobDecoder
}

// WordEmbeddings is any type that is both a corpus and can return word vectors
type WordEmbeddings interface {
	Corpus

	// WordVector returns a vector of embeddings given the word
	WordVector(word string) (vec tensor.Tensor, err error)

	// Vector returns a vector of embeddings given the word ID
	Vector(id int) (vec tensor.Tensor, err error)

	// Embedding returns the whole embedding matrix as a single tensor.
	Embedding() tensor.Tensor
}


================================================
FILE: io.go
================================================
package lingo

import (
	"bytes"
	"encoding/json"
	"fmt"
	"strings"

	"github.com/pkg/errors"
)

// dummyAnnotation is the plain decoding target used by the UnmarshalJSON
// methods in this file. It mirrors the members written by
// (*Annotation).MarshalJSON, but stores Head as an integer instead of a
// pointer so that heads can be resolved after the whole sentence is decoded.
//
// The JSON keys must match the ones MarshalJSON emits. The WordFlag tag used
// to read "WordFlat", which never matched the emitted "WordFlag" key, so the
// flag was silently dropped on decode.
type dummyAnnotation struct {
	POSTag         `json:"POSTag"`
	DependencyType `json:"Label"`

	ID    int    `json:"ID"`
	Head  int    `json:"Head"`
	Value string `json:"Value"`
	Lemma string `json:"Lemma"`
	Stem  string `json:"Stem"`

	Cluster  `json:"Cluster"`
	Shape    `json:"Shape"`
	WordFlag `json:"WordFlag"`
}

// func (a *Annotation) MarshalText() ([]byte, error) {
// 	var buf bytes.Buffer
// 	if a.Head != nil {
// 		fmt.Fprintf(&buf, "%v(%q/%v-%d, %q/%v-%d)", a.DependencyType, a.Value, a.POSTag, a.ID, a.Head.Value, a.Head.POSTag, a.Head.ID)
// 	} else if a == rootAnnotation {
// 		fmt.Fprintf(&buf, "ROOT")
// 	} else {
// 		fmt.Fprintf(&buf, "%q/%v-%d", a.Value, a.POSTag, a.ID)
// 	}
// 	return buf.Bytes(), nil
// }

// MarshalJSON encodes the annotation as a JSON object.
//
// ID, Value, POSTag and Label are always written. Head is written only when
// the annotation has a head: -1000 marks the root annotation, any other head
// is written as a.HeadID(). Lemma, Stem, Cluster, Shape and WordFlag are
// written only when they are non-zero. Lowered is never written because it
// can be recomputed from Value.
//
// String members are encoded with json.Marshal rather than fmt's %q: %q
// produces Go string syntax, which is not valid JSON for some inputs (for
// example "\x7f" or "\a").
func (a *Annotation) MarshalJSON() ([]byte, error) {
	var buf bytes.Buffer

	// writeString appends prefix followed by the JSON encoding of s.
	writeString := func(prefix, s string) error {
		enc, err := json.Marshal(s)
		if err != nil {
			return err
		}
		buf.WriteString(prefix)
		buf.Write(enc)
		return nil
	}

	buf.WriteRune('{')

	fmt.Fprintf(&buf, "\"ID\": %d,", a.ID)
	if err := writeString("\"Value\": ", a.Value); err != nil {
		return nil, err
	}
	buf.WriteRune(',')
	fmt.Fprintf(&buf, "\"POSTag\": \"%v\",", a.POSTag)
	fmt.Fprintf(&buf, "\"Label\": \"%v\"", a.DependencyType)

	if a.Head != nil {
		if a.Head == rootAnnotation {
			fmt.Fprintf(&buf, ", \"Head\": -1000") // special signifier for root annotations
		} else {
			fmt.Fprintf(&buf, ", \"Head\": %d", a.HeadID())
		}
	}

	if a.Lemma != "" {
		if err := writeString(", \"Lemma\": ", a.Lemma); err != nil {
			return nil, err
		}
	}

	// Lowered is not serialized because it's a simple function call away

	if a.Stem != "" {
		if err := writeString(",\"Stem\": ", a.Stem); err != nil {
			return nil, err
		}
	}

	if a.Cluster > 0 {
		fmt.Fprintf(&buf, ",\"Cluster\": %d", a.Cluster)
	}

	if a.Shape != "" {
		if err := writeString(",\"Shape\": ", string(a.Shape)); err != nil {
			return nil, err
		}
	}

	if a.WordFlag > 0 {
		fmt.Fprintf(&buf, ",\"WordFlag\": %d", a.WordFlag)
	}
	buf.WriteRune('}')
	return buf.Bytes(), nil
}

// UnmarshalJSON decodes a single JSON annotation object into a.
//
// Value, POSTag, DependencyType, ID, Lemma, Stem, Cluster, Shape and WordFlag
// are copied from the decoded object, and Lowered is recomputed from Value
// (it is never serialized), matching what (*AnnotatedSentence).UnmarshalJSON
// does. Any "Head" member is ignored here: it is an index into a sentence and
// cannot be resolved for a lone annotation.
//
// It returns an error when a is nil or when b is not valid JSON for an
// annotation.
func (a *Annotation) UnmarshalJSON(b []byte) error {
	if a == nil {
		return errors.Errorf("cannot unmarshal JSON into a nil *Annotation")
	}

	d := dummyAnnotation{}
	if err := json.Unmarshal(b, &d); err != nil {
		return err
	}

	a.Value = d.Value
	a.POSTag = d.POSTag
	a.DependencyType = d.DependencyType
	a.ID = d.ID
	a.Lemma = d.Lemma
	a.Stem = d.Stem
	a.Cluster = d.Cluster
	a.Shape = d.Shape
	a.WordFlag = d.WordFlag

	// Lowered is derived data; rebuild it so a decoded annotation is usable
	// without a separate fix-up step.
	a.Lowered = strings.ToLower(a.Value)

	return nil
}

// MarshalJSON encodes the sentence as a JSON array with one element per
// annotation, each produced by a json.Encoder writing into the output buffer
// (so every element is followed by a newline). The first encoding error
// aborts the whole operation.
func (as AnnotatedSentence) MarshalJSON() ([]byte, error) {
	var out bytes.Buffer
	enc := json.NewEncoder(&out)

	out.WriteRune('[')
	for i := range as {
		// separator goes in front of every element but the first
		if i > 0 {
			out.WriteRune(',')
		}
		if err := enc.Encode(as[i]); err != nil {
			return nil, err
		}
	}
	out.WriteRune(']')

	return out.Bytes(), nil
}

// UnmarshalJSON decodes a JSON array of annotation objects into as.
//
// After a successful call *as has exactly one entry per JSON element:
// missing entries are appended and surplus entries are dropped (the latter
// used to panic on a negative slice length). Existing non-nil annotations are
// reused and overwritten. An element whose Value is "-ROOT-" is replaced by
// the shared root annotation.
//
// "Head" is used as an index into the decoded sentence, with -1000 standing
// for the root annotation. An index outside the sentence is reported as an
// error before *as is modified, instead of panicking.
//
// NOTE(review): an element without a "Head" member decodes as head 0 and is
// therefore attached to the first annotation; this also applies to "-ROOT-"
// elements. That behaviour is kept as it was — confirm it is intended.
func (as *AnnotatedSentence) UnmarshalJSON(b []byte) error {
	var dummies []dummyAnnotation
	if err := json.Unmarshal(b, &dummies); err != nil {
		return err
	}

	// Validate every head reference up front so a bad document neither
	// panics nor leaves the sentence half-populated.
	for i, d := range dummies {
		if d.Head == -1000 {
			continue
		}
		if d.Head < 0 || d.Head >= len(dummies) {
			return fmt.Errorf("annotation %d: head index %d is outside the sentence (length %d)", i, d.Head, len(dummies))
		}
	}

	// Make the sentence exactly as long as the decoded array.
	switch n := len(dummies); {
	case len(*as) < n:
		*as = append(*as, make(AnnotatedSentence, n-len(*as))...)
	case len(*as) > n:
		*as = (*as)[:n]
	}

	for i, d := range dummies {
		a := (*as)[i]
		if d.Value == "-ROOT-" {
			(*as)[i] = rootAnnotation
			continue
		}

		if a == nil {
			a = new(Annotation)
		}

		a.Value = d.Value
		a.POSTag = d.POSTag
		a.DependencyType = d.DependencyType
		a.ID = d.ID
		a.Lemma = d.Lemma
		a.Stem = d.Stem
		a.Cluster = d.Cluster
		a.Shape = d.Shape
		a.WordFlag = d.WordFlag

		(*as)[i] = a
	}

	// fix up head IDs; every index was validated above
	for i, d := range dummies {
		a := (*as)[i]
		if d.Head == -1000 {
			a.SetHead(rootAnnotation)
		} else {
			a.SetHead((*as)[d.Head])
		}
	}

	// TODO: fix up other things
	for _, a := range *as {
		a.Lowered = strings.ToLower(a.Value)
	}

	return nil
}


================================================
FILE: io_test.go
================================================
package lingo

import (
	"encoding/json"
	"testing"
)

// TestAnnotationJSON checks that an Annotation can be marshalled and that a
// hand-written JSON object for the same annotation decodes back to the same
// ID, Value, POSTag and DependencyType.
func TestAnnotationJSON(t *testing.T) {
	a := NewAnnotation()
	a.Value = "hello"
	a.POSTag = NOUN
	a.DependencyType = Aux
	a.ID = 2

	b, err := json.Marshal(a)
	if err != nil {
		// nothing useful can be logged or compared after a failed marshal
		t.Fatal(err)
	}
	t.Logf(" %s", string(b))

	x := `{"ID":2,"Value":"hello","POSTag":"NOUN","Label":"Aux"}`
	c := NewAnnotation()
	if err = json.Unmarshal([]byte(x), c); err != nil {
		// the comparisons below are meaningless if decoding failed
		t.Fatal(err)
	}

	if c.ID != a.ID {
		t.Errorf("Expected ID to be %v. Got %v instead", a.ID, c.ID)
	}

	if c.Value != a.Value {
		t.Errorf("Expected Value to be %q. Got %q instead", a.Value, c.Value)
	}

	if c.POSTag != a.POSTag {
		t.Errorf("Expected POSTag to be %v. Got %v instead", a.POSTag, c.POSTag)
	}

	if c.DependencyType != a.DependencyType {
		t.Errorf("Expected DependencyType to be %v. Got %v instead", a.DependencyType, c.DependencyType)
	}
}

// TestAnnotatedSentenceJSON checks that a two-word sentence can be marshalled
// and that a hand-written JSON array for the same sentence decodes back to
// annotations with the same Value, POSTag, DependencyType and head ID.
func TestAnnotatedSentenceJSON(t *testing.T) {
	a := NewAnnotation()
	a.Value = "hello"
	a.POSTag = NOUN
	a.DependencyType = Aux
	a.ID = 0

	b := NewAnnotation()
	b.Value = "world"
	b.POSTag = NOUN
	b.DependencyType = Aux
	b.ID = 1
	b.Head = rootAnnotation

	a.Head = b

	as := AnnotatedSentence{a, b}
	bs, err := json.Marshal(as)
	if err != nil {
		t.Fatal(err)
	}
	t.Logf("%s", string(bs))

	x := `[{"ID":0,"Value":"hello","POSTag":"NOUN","Label":"Aux","Head":1},{"ID":1,"Value":"world","POSTag":"NOUN","Label":"Aux","Head":-1000}]`

	var cs AnnotatedSentence
	if err = json.Unmarshal([]byte(x), &cs); err != nil {
		// a partially decoded sentence is not worth comparing
		t.Fatal(err)
	}
	t.Logf("%v", cs)

	// Guard the as[i] lookups below against a decoder that returns the wrong
	// number of annotations.
	if len(cs) != len(as) {
		t.Fatalf("Expected %d annotations. Got %d instead", len(as), len(cs))
	}

	for i, c := range cs {
		d := as[i]

		if c.Value != d.Value {
			t.Error("Expected Values to be the same")
		}

		if c.POSTag != d.POSTag {
			t.Error("POSTag not the same")
		}

		if c.DependencyType != d.DependencyType {
			t.Error("Dependency Types not the same")
		}

		if c.HeadID() != d.HeadID() {
			t.Errorf("%v HeadIDs not the same. Want %v, got %v instead", d, d.HeadID(), c.HeadID())
		}
	}
}


================================================
FILE: lexeme.go
================================================
package lingo

import (
	"fmt"
	"unicode"
)

//go:generate stringer -type=LexemeType

// LexemeType classifies a Lexeme. Its String method is generated by stringer
// (see lexemetype_string.go), so the order of the constants below must stay in
// sync with the generated name table.
type LexemeType byte

const (
	// EOF is the zero value of LexemeType.
	EOF LexemeType = iota
	Word
	Disambig
	URI
	Number
	Date
	Time
	Punctuation
	Symbol
	Space
	// SystemUse is the type of the start, root and null sentinel lexemes
	// declared in this file.
	SystemUse
)

// Lexeme is a single lexed unit: its text, its type and its position.
// MakeLexeme initializes the three position fields to -1.
type Lexeme struct {
	// Value is the text of the lexeme.
	Value      string
	// LexemeType is the classification of the lexeme.
	LexemeType LexemeType

	// Line, Col and Pos locate the lexeme in its input; -1 when not set.
	Line int
	Col  int
	Pos  int
}

// MakeLexeme builds a Lexeme holding the text s and the type t. The position
// fields (Line, Col, Pos) are all set to -1.
func MakeLexeme(s string, t LexemeType) Lexeme {
	const unset = -1

	var lex Lexeme
	lex.Value, lex.LexemeType = s, t
	lex.Line, lex.Col, lex.Pos = unset, unset, unset
	return lex
}

// Fix returns a copy of l whose type is changed to Number when
// StringIs(l.Value, unicode.IsDigit) reports true; otherwise the copy is
// returned unchanged.
func (l Lexeme) Fix() Lexeme {
	if StringIs(l.Value, unicode.IsDigit) {
		l.LexemeType = Number
	}
	return l
}

// String renders the lexeme as its quoted value followed by "/" and its type.
// An EOF lexeme is rendered as just "EOF".
func (l Lexeme) String() string {
	if l.LexemeType == EOF {
		return "EOF"
	}
	return fmt.Sprintf("%q/%v", l.Value, l.LexemeType)
}

// GoString renders the lexeme for %#v: its type, its quoted value and its
// (Line, Col, Pos) triple. EOF lexemes use the literal prefix "EOF".
func (l Lexeme) GoString() string {
	if l.LexemeType == EOF {
		return fmt.Sprintf("EOF: %q (%d, %d, %d)", l.Value, l.Line, l.Col, l.Pos)
	}
	return fmt.Sprintf("%s: %q (%d, %d, %d)", l.LexemeType, l.Value, l.Line, l.Col, l.Pos)
}

// Sentinel lexemes of type SystemUse, created once at package initialization.
var startLexeme = MakeLexeme("START_LEXEME", SystemUse)
var rootLexeme = MakeLexeme("-ROOT-", SystemUse)
var nullLexeme = MakeLexeme("", SystemUse)

// StartLexeme returns a copy of the "START_LEXEME" sentinel lexeme.
func StartLexeme() Lexeme { return startLexeme }
// RootLexeme returns a copy of the "-ROOT-" sentinel lexeme.
func RootLexeme() Lexeme  { return rootLexeme }
// NullLexeme returns a copy of the empty-valued sentinel lexeme.
func NullLexeme() Lexeme  { return nullLexeme }


================================================
FILE: lexemetype_string.go
================================================
// Code generated by "stringer -type=LexemeType"; DO NOT EDIT

package lingo

import "fmt"

const _LexemeType_name = "EOFWordDisambigURINumberDateTimePunctuationSymbolSpaceSystemUse"

var _LexemeType_index = [...]uint8{0, 3, 7, 15, 18, 24, 28, 32, 43, 49, 54, 63}

// String returns the name of i, sliced out of _LexemeType_name using the
// offsets in _LexemeType_index. Values past the end of the index table are
// rendered as "LexemeType(N)".
func (i LexemeType) String() string {
	if i >= LexemeType(len(_LexemeType_index)-1) {
		return fmt.Sprintf("LexemeType(%d)", i)
	}
	return _LexemeType_name[_LexemeType_index[i]:_LexemeType_index[i+1]]
}


================================================
FILE: lexer/lexer.go
================================================
package lexer

import (
	"bufio"
	"bytes"
	"io"
	"strings"
	"sync"

	"golang.org/x/text/unicode/norm"

	"github.com/chewxy/lingo"
)

// eof is the sentinel rune that (*Lexer).next returns when the input is exhausted.
const eof rune = -1

// Lexer reads runes from a buffered input and sends the resulting lexemes on
// Output. Run and Reset both take the embedded mutex.
type Lexer struct {
	// name is the label passed to New.
	name  string
	// input is the buffered reader the runes are read from.
	input *bufio.Reader

	// NOTE(review): state is not referenced by the methods in this file.
	state stateFn
	// r is the rune most recently read by next, and width its size in bytes.
	r     rune
	width int
	// pos and col advance by the rune width on every read; start is set to
	// pos by ignore; line is advanced by lineCount.
	pos   int
	start int
	line  int
	col   int

	// buf accumulates the runes of the lexeme currently being read
	buf *bytes.Buffer

	// Output receives every emitted lexeme and is closed when Run returns.
	Output chan lingo.Lexeme
	// Errors is created alongside Output by New and Reset.
	Errors chan error

	sync.Mutex
}

// New creates a Lexer called name that reads from r through a bufio.Reader.
// Output and Errors are fresh unbuffered channels, and the position counters
// start at 1.
func New(name string, r io.Reader) *Lexer {
	l := new(Lexer)
	l.name = name
	l.input = bufio.NewReader(r)
	l.buf = new(bytes.Buffer)

	// for userfriendliness, the column index starts at 1
	l.width, l.start, l.col, l.pos = 1, 1, 1, 1

	l.Output = make(chan lingo.Lexeme)
	l.Errors = make(chan error)
	return l
}

// Run drives the state machine, starting at lexText, until a state function
// returns nil. The lexer is locked for the whole run, and Output is closed
// just before the lock is released.
func (l *Lexer) Run() {
	l.Lock()
	defer l.Unlock()
	defer close(l.Output)

	// each state function returns the next state to execute
	for state := lexText; state != nil; state = state(l) {
	}
}

// Reset points the lexer at a new reader r, empties the lexeme buffer and
// replaces Output and Errors with fresh channels, all under the lexer's lock.
//
// NOTE(review): the position counters (pos, col, line, start) and the last
// rune are left untouched, so they carry over from the previous input —
// confirm whether that is intended.
func (l *Lexer) Reset(r io.Reader) {
	l.Lock()
	defer l.Unlock()

	l.input.Reset(r)
	l.buf.Reset()
	l.Output, l.Errors = make(chan lingo.Lexeme), make(chan error)
}

// next reads one rune from the input, stores it (and its width) on the lexer,
// advances col and pos by that width and returns the rune.
//
// When the read fails, next returns eof and sets width to 1; bufio has by
// then stored rune 0 in l.r. Errors other than io.EOF used to be ignored,
// which made next return rune 0 and left callers waiting for an eof that
// never came on a persistently failing reader. They now end the input too,
// and are offered on Errors without blocking, so they are dropped when no
// one is receiving.
func (l *Lexer) next() rune {
	var err error
	l.r, l.width, err = l.input.ReadRune()
	if err != nil {
		if err != io.EOF {
			select {
			case l.Errors <- err:
			default:
			}
		}
		l.width = 1
		return eof
	}
	l.col += l.width
	l.pos += l.width

	return l.r
}

// nextUntilEOF reads and accepts runes until it reads one that is contained
// in s, or until the input is exhausted. The terminating rune is consumed but
// not accepted. It returns true when the loop stopped because of EOF.
//
// The result is taken from the value returned by next. The previous version
// tested l.r, but next stores rune 0 (not eof) in l.r at end of input, so it
// always reported false.
func (l *Lexer) nextUntilEOF(s string) bool {
	r := l.next()
	for r != eof && strings.IndexRune(s, r) < 0 {
		l.accept()
		r = l.next()
	}
	return r == eof
}

// backup un-reads the last rune from the input and moves col and pos back by
// the width of that rune. The error returned by UnreadRune is not checked.
func (l *Lexer) backup() {
	w := l.width
	l.input.UnreadRune()
	l.col -= w
	l.pos -= w
}

// peek returns the next rune without consuming it: it reads a rune, backs up,
// and then restores the pos, col and current rune recorded before the read.
func (l *Lexer) peek() rune {
	savedRune, savedPos, savedCol := l.r, l.pos, l.col

	peeked := l.next()
	l.backup()

	l.pos, l.col, l.r = savedPos, savedCol, savedRune
	return peeked
}

// lineCount adds the number of newlines currently held in the buffer to the
// line counter and, when there is at least one, resets the column to 1.
func (l *Lexer) lineCount() {
	n := bytes.Count(l.buf.Bytes(), []byte{'\n'})
	if n == 0 {
		return
	}
	l.line += n
	l.col = 1
}

// accept appends the current rune (l.r) to the lexeme buffer.
func (l *Lexer) accept() {
	l.buf.WriteRune(l.r)
}

// acceptRun consumes and accepts runes for as long as the upcoming rune is
// one of the runes in valid. It reports whether at least one rune was accepted.
func (l *Lexer) acceptRun(valid string) (accepted bool) {
	for strings.ContainsRune(valid, l.peek()) {
		l.next()
		l.accept()
		accepted = true
	}
	return accepted
}

// acceptRunFn consumes and accepts runes for as long as fn reports true for
// the upcoming rune. It returns the number of runes accepted.
func (l *Lexer) acceptRunFn(fn func(rune) bool) (accepted int) {
	for ; fn(l.peek()); accepted++ {
		l.next()
		l.accept()
	}
	return accepted
}

// ignore discards the buffered text and marks the current position as the
// start of the next lexeme.
func (l *Lexer) ignore() {
	l.start = l.pos
	l.buf.Reset()
}

// emit sends the buffered text, NFC-normalized, on Output as a lexeme of type
// t. Line is the current line, Col is the recorded start, and Pos is the
// current position minus the buffered byte length. Afterwards the buffer is
// cleared and, unless the current rune is 0, re-seeded with that rune.
func (l *Lexer) emit(t lingo.LexemeType) {
	lex := lingo.MakeLexeme(string(norm.NFC.Bytes(l.buf.Bytes())), t)
	lex.Line, lex.Col = l.line, l.start
	// TODO: the offset is sometimes wrong on leading tokens since l.pos starts at 1
	lex.Pos = l.pos - l.buf.Len()

	l.Output <- lex

	// start over for the next lexeme
	l.ignore()
	if l.r == 0x0 {
		return
	}
	l.buf.WriteRune(l.r)
}


================================================
FILE: lexer/lexer_test.go
================================================
package lexer

import (
	"strings"
	"testing"

	"github.com/chewxy/lingo"
)

// lexerTest is one table-driven case for TestLexer.
type lexerTest struct {
	// name labels the case in failure messages and names the lexer.
	name string
	// s is the input text handed to the lexer.
	s    string

	// lexemes are the lexemes the lexer is expected to emit, in order.
	lexemes []lingo.Lexeme
}

var lexerTests = []lexerTest{
	// {"empty", "", []lingo.Lexeme{
	// 	{"", lingo.EOF, 0, 1, 0},
	// }},
	//
	// {"spaces", " \t", []lingo.Lexeme{
	// 	{"", lingo.EOF, 0, 3, 2},
	// }},
	//
	// {"newlines", "\n\r\n\n", []lingo.Lexeme{
	// 	{"", lingo.EOF, 3, 5, 4},
	// }},
	//
	// {"simple text", "hello world", []lingo.Lexeme{
	// 	{"hello", lingo.Word, 0, 1, 0},
	// 	{"world", lingo.Word, 0, 7, 6},
	// 	{"", lingo.EOF, 0, 12, 11},
	// }},
	//
	// {"simple number", "3.1415", []lingo.Lexeme{
	// 	{"3.1415", lingo.Number, 0, 1, 0},
	// 	{"", lingo.EOF, 0, 12, 5},
	// }},

	{"advanced numerology", "3.14 -1.618", []lingo.Lexeme{
		{"3.14", lingo.Number, 0, 1, 0},
		{"-1.618", lingo.Number, 0, 6, 5},
		{"", lingo.EOF, 0, 11, 10},
	}},

	// {"advanced numerology", "3.14 -1.618 6.023e23 1e-13", []lingo.Lexeme{
	// 	{"3.14", lingo.Number, 0, 1, 0},
	// 	{"-1.618", lingo.Number, 0, 6, 5},
	// 	{"6.023e23", lingo.Number, 0, 13, 12},
	// 	{"1e-13", lingo.Number, 0, 21, 20},
	// 	{"", lingo.EOF, 0, 26, 25},
	// }},
	//
	// {"esoteric numerology", "1/2 1 1/4", []lingo.Lexeme{
	// 	{"1/2", lingo.Number, 0, 1, 0},
	// 	{"1", lingo.Number, 0, 5, 4},
	// 	{"1/4", lingo.Number, 0, 7, 6},
	// 	{"", lingo.EOF, 0, 10, 9},
	// }},
	//
	// {"text with numbers", "one plus 1 don't equals 3", []lingo.Lexeme{
	// 	{"one", lingo.Word, 0, 1, 0},
	// 	{"plus", lingo.Word, 0, 5, 4},
	// 	{"1", lingo.Number, 0, 10, 9},
	// 	{"do", lingo.Word, 0, 12, 11},
	// 	{"n't", lingo.Word, 0, 14, 13},
	// 	{"equals", lingo.Word, 0, 18, 17},
	// 	{"3", lingo.Number, 0, 24, 23},
	// 	{"", lingo.EOF, 0, 25, 24},
	// }},
	//
	// {"text with numbers + punct", "First111!.!", []lingo.Lexeme{
	// 	{"First111", lingo.Word, 0, 1, 0},
	// 	{"!.!", lingo.Punctuation, 0, 9, 8},
	// 	{"", lingo.EOF, 0, 10, 9},
	// }},
	//
	// {"text with verb contractions", "You're panic'd I'll get'em I've", []lingo.Lexeme{
	// 	{"You", lingo.Word, 0, 1, 0},
	// 	{"'re", lingo.Word, 0, 3, 2},
	// 	{"panic", lingo.Word, 0, 8, 7},
	// 	{"'d", lingo.Word, 0, 13, 12},
	// 	{"I", lingo.Word, 0, 16, 15},
	// 	{"'ll", lingo.Word, 0, 17, 16},
	// 	{"get", lingo.Word, 0, 21, 20},
	// 	{"'em", lingo.Word, 0, 24, 23},
	// 	{"I", lingo.Word, 0, 27, 26},
	// 	{"'ve", lingo.Word, 0, 30, 29},
	// 	{"", lingo.EOF, 0, 33, 32},
	// }},
	//
	// {"email", "dont@email.this", []lingo.Lexeme{
	// 	{"dont@email.this", lingo.Word, 0, 1},
	// 	{"", lingo.EOF, 0, 10},
	// }},
	//
	// {"plain dashes should not be numbers", "this case - like so", []lingo.Lexeme{
	// 	{"this", lingo.Word, 0, 1},
	// 	{"case", lingo.Word, 0, 5},
	// 	{"-", lingo.Punctuation, 0, 6},
	// 	{"like", lingo.Word, 0, 8},
	// 	{"so", lingo.Word, 0, 13},
	// 	{"", lingo.EOF, 0, 14},
	// }},
	//
	// {"parens should be printed", "like (this)", []lingo.Lexeme{
	// 	{"like", lingo.Word, 0, 1},
	// 	{"(", lingo.Punctuation, 0, 5},
	// 	{"this", lingo.Word, 0, 6},
	// 	{")", lingo.Punctuation, 0, 10},
	// 	{"", lingo.EOF, 0, 11},
	// }},
	//
	// {"parenthesis should be considered separate", "USA(United States of America)", []lingo.Lexeme{
	// 	{"USA", lingo.Word, 0, 1},
	// 	{"(", lingo.Punctuation, 0, 1},
	// 	{"United", lingo.Word, 0, 1},
	// 	{"States", lingo.Word, 0, 1},
	// 	{"of", lingo.Word, 0, 1},
	// 	{"America", lingo.Word, 0, 1},
	// 	{")", lingo.Punctuation, 0, 1},
	// 	{"", lingo.EOF, 0, 0},
	// }},
	//
	// {"midstream puncuation", "like:this", []lingo.Lexeme{
	// 	{"like", lingo.Word, 0, 1},
	// 	{":", lingo.Punctuation, 0, 5},
	// 	{"this", lingo.Word, 0, 6},
	// 	{"", lingo.EOF, 0, 7},
	// }},
	//
	// {"midstream symbols", "e-meet ke$ha by e-mail $ell anti-inflammatory", []lingo.Lexeme{
	// 	{"e-meet", lingo.Word, 0, 1},
	// 	{"ke$ha", lingo.Word, 0, 1},
	// 	{"by", lingo.Word, 0, 1},
	// 	{"e-mail", lingo.Word, 0, 1},
	// 	{"$", lingo.Symbol, 0, 1},
	// 	{"ell", lingo.Word, 0, 1},
	// 	{"anti-inflammatory", lingo.Word, 0, 1},
	// 	{"", lingo.EOF, 0, 0},
	// }},
	//
	// {"abbrev", "USB, made in U.S.A. e.g t/away c/o", []lingo.Lexeme{
	// 	{"USB", lingo.Word, 0, 1},
	// 	{",", lingo.Punctuation, 0, 4},
	// 	{"made", lingo.Word, 0, 6},
	// 	{"in", lingo.Word, 0, 11},
	// 	{"U.S.A", lingo.Word, 0, 14},
	// 	{".", lingo.Punctuation, 0, 19},
	// 	{"e.g", lingo.Word, 0, 0},
	// 	{"t/away", lingo.Word, 0, 0},
	// 	{"c/o", lingo.Word, 0, 0},
	// 	{"", lingo.EOF, 0, 20},
	// }},
	//
	// {"date time", "1970/1/1 00:00 00:00:00", []lingo.Lexeme{
	// 	{"1970/1/1", lingo.Date, 0, 1},
	// 	{"00:00", lingo.Time, 0, 1},
	// 	{"00:00:00", lingo.Time, 0, 20},
	// 	{"", lingo.EOF, 0, 20},
	// }},
	//
	// {"date time with dashes", "31-12-1970", []lingo.Lexeme{
	// 	{"31/12/1970", lingo.Date, 0, 1},
	// 	{"", lingo.EOF, 0, 11},
	// }},
	//
	// {"URI", "wobsite: http://www.wobsite.something.this/is/still/a.url", []lingo.Lexeme{
	// 	{"wobsite", lingo.Word, 0, 1},
	// 	{":", lingo.Punctuation, 0, 8},
	// 	{"http://www.wobsite.something.this/is/still/a.url", lingo.URI, 0, 10},
	// 	{"", lingo.EOF, 0, 20},
	// }},
	//
	// {"proper sentence", "hello world.", []lingo.Lexeme{
	// 	{"hello", lingo.Word, 0, 1},
	// 	{"world", lingo.Word, 0, 6},
	// 	{".", lingo.Punctuation, 0, 7},
	// 	{"", lingo.EOF, 0, 8},
	// }},
	//
	// // Naive and Cafe are written with combining diacritics, while the rest use precomposed Unicode characters.
	// // The lexer should normalize all of them to the same form.
	// {"pathological english words", "Façade à la Naïve Château Café", []lingo.Lexeme{
	// 	{"Façade", lingo.Word, 0, 1},
	// 	{"à", lingo.Word, 0, 1},
	// 	{"la", lingo.Word, 0, 1},
	// 	{"Naïve", lingo.Word, 0, 1},
	// 	{"Château", lingo.Word, 0, 1},
	// 	{"Café", lingo.Word, 0, 1},
	// 	{"", lingo.EOF, 0, 0},
	// }},
	//
	// // non-Latin scripts: Chinese and right-to-left Arabic
	// {"jpf", "你好 العالم", []lingo.Lexeme{
	// 	{"你好", lingo.Word, 0, 1},
	// 	{"العالم", lingo.Word, 0, 1},
	// 	{"", lingo.EOF, 0, 0},
	// }},
}

// testLexer runs a fresh lexer over the input of lts in a goroutine and
// collects everything sent on Output until the channel is closed.
func testLexer(lts *lexerTest) []lingo.Lexeme {
	lx := New(lts.name, strings.NewReader(lts.s))
	go lx.Run()

	var collected []lingo.Lexeme
	out := lx.Output
	for {
		lex, ok := <-out
		if !ok {
			return collected
		}
		collected = append(collected, lex)
	}
}

// TestLexer lexes every case in lexerTests and compares the type, value and
// Pos of each emitted lexeme with the expectation. A case whose lexeme count
// differs is reported once and skipped.
func TestLexer(t *testing.T) {
	for _, lts := range lexerTests {
		got := testLexer(&lts)
		want := lts.lexemes

		if len(want) != len(got) {
			t.Errorf("Test %q: Expected %d lexemes. Got %d instead: %v", lts.name, len(want), len(got), got)
			continue
		}

		for i := range got {
			matches := got[i].LexemeType == want[i].LexemeType &&
				got[i].Value == want[i].Value &&
				want[i].Pos == got[i].Pos
			if matches {
				continue
			}
			t.Errorf("Test %q, lexeme %d: Expected %#v. Got %#v instead", lts.name, i, want[i], got[i])
		}
	}
}


================================================
FILE: lexer/stateFn.go
================================================
package lexer

import (
	"unicode"

	"github.com/chewxy/lingo"
)

// stateFn is one state of the lexer: a function that works on the Lexer and
// returns the state function to run next. The state functions in this file
// return nil only after emitting lingo.EOF.
type stateFn func(*Lexer) stateFn

// lexText is the state function for word-like text.
//
// It reads runes one at a time with l.next until eof. Each rune is either
// accepted into the lexeme being built, or it ends that lexeme: any pending
// text is then emitted as a lingo.Word and the state function that should
// handle the rune is returned. Some branches emit and return directly instead
// of going through the shared finishup path.
//
// When eof is reached, pending text (if any) is emitted as a lingo.Word,
// followed by a lingo.EOF, and nil is returned.
func lexText(l *Lexer) (fn stateFn) {
	for {
		next := l.next()
		if next == eof {
			break
		}
		// NOTE(review): presumably true whenever l.next consumed a rune — confirm against Lexer.next.
		if l.pos != l.start {
			switch {
			case unicode.IsSpace(next):
				// whitespace ends the word; the whitespace itself is handled by lexWhitespace
				l.backup()
				fn = lexWhitespace
			case unicode.IsDigit(next):

				// if the position is start +1.
				// This means that the first char of the string to be lexed is a number
				// this prevents things like "yay1111" to be lexed as "yay" and "1111"
				if l.pos == l.start+1 {
					l.backup()
					return lexNumber
				}
				// any other digit leaves fn nil and is accepted into the word at the bottom of the loop
			case next == ':':
				// possible URI
				if l.peek() == '/' {
					l.accept() // accept ':'
					l.next()
					if l.peek() == '/' {
						l.accept()
						return lexURI
					}
					// otherwise...
					l.backup()
					// "unaccept". since '/' has a width of 1 we can do the following
					l.buf.Truncate(l.buf.Len() - 1)
				}
				fn = lexPunctuation
			case unicode.IsPunct(next):
				// For things like "u.s" or "i.e." or "e.g."
				n := l.peek()

				switch {
				case next == '\'':
					// an apostrophe followed by a letter ends the current word here;
					// otherwise fn stays nil and the apostrophe is accepted below
					if unicode.IsLetter(n) {
						l.emit(lingo.Word)
						return lexText
					}
				case n == eof:
					// common scenario - where a punctuation ends the sentence, and this thing is unable to backup
					l.width = 1
					l.backup()
					l.width = 0
					fn = lexPunctuation
					goto finishup // goto because there are other cases below
				case unicode.IsLetter(n) && (next == '.' || next == '@' || next == '-' || next == '/'):
					// acceptable midstream punctuations in words are emails and abbreviations
					l.accept()
					return lexText
				default:
					// it's definitely a punctuation
					l.backup()
					fn = lexPunctuation
				}

			case unicode.IsSymbol(next):
				// for things like "ke$ha"
				// bear in mind that "$ell" will be split into two lexemes.
				n := l.peek()
				if unicode.IsLetter(n) {
					l.backup()
					l.accept()
					return lexText
				}
				//l.backup()
				fn = lexSymbol
			case next == 'n':
				// for things like "don't" or "doesn't"
				n := l.peek()
				if n == '\'' {
					l.backup()
					l.emit(lingo.Word)
					return lexPunctuation
				} else {
					l.accept() // accept n
					return lexText
				}
			}
		}

	finishup:
		// fn is only set when the current rune ends the word being built
		if fn != nil {
			if l.start != l.pos {
				l.emit(lingo.Word)
			}
			return fn
		}
		// otherwise keep lexText
		l.accept()
	}

	// eof: flush whatever text is still pending before signalling the end of input
	if l.pos > l.start {
		l.emit(lingo.Word)
	}

	l.emit(lingo.EOF)
	return nil
}

// lexNumber lexes numbers. It accepts runs of unicode digits.
// Upon stopping, it checks to see if the next value is a '.'. If it is, then it's a decimal value, and continues a run
// Upon stopping a second time, it checks for 'e' or 'E', for exponentiation - 1.2E2
//
// A '-' or '/' after the first digit run hands over to lexDate (the separator
// is recorded as '/'); a ':' hands over to lexTime when the position check
// below holds, and otherwise ends the number. A buffer holding only "-" is
// emitted as lingo.Punctuation instead of lingo.Number.
func lexNumber(l *Lexer) (fn stateFn) {
	l.acceptRunFn(unicode.IsDigit)

	next := l.next()
	switch next {
	case '.':
		l.accept() // accept the dot
		l.acceptRunFn(unicode.IsDigit)
	case '-', '/':
		// standardize
		l.r = '/'
		l.accept()
		return lexDate
	case ':':
		// NOTE(review): presumably two digits plus the ':' just read (e.g. "12:") — confirm against Lexer.next
		if l.pos-l.start == 3 {
			l.accept()
			return lexTime
		} else {
			l.backup()
			l.emit(lingo.Number)
			return lexPunctuation
		}
	default:
		l.backup()
	}

	if l.acceptRun("eE") {
		// handle negative exponents
		if l.peek() == '-' {
			l.next()
			l.accept()
			// re-enter lexNumber directly to lex the digits of the exponent
			return lexNumber(l)
		}
		l.acceptRunFn(unicode.IsDigit)
	}
	l.backup()

	// lexWhitespace accepts a leading '-' before handing over to lexNumber; if
	// nothing followed it, the buffer holds just the dash
	if l.buf.Len() == 1 && l.buf.Bytes()[0] == '-' {
		l.emit(lingo.Punctuation) // dash
		return lexWhitespace
	}
	l.emit(lingo.Number)
	return lexWhitespace
}

// lexWhitespace consumes a run of whitespace, updates the line count, and
// discards what was consumed so that nothing is emitted for it. It then peeks
// at the following rune to choose the next state: digits go to lexNumber, a
// '-' is accepted and also goes to lexNumber, other punctuation goes to
// lexPunctuation, symbols go to lexSymbol, and everything else to lexText.
func lexWhitespace(l *Lexer) (fn stateFn) {
	l.acceptRunFn(unicode.IsSpace)
	l.lineCount()
	l.ignore() // nothing will be emitted

	switch upcoming := l.peek(); {
	case unicode.IsDigit(upcoming):
		return lexNumber
	case upcoming == '-':
		// a leading dash may be the sign of a number, so let lexNumber decide
		l.next()
		l.accept()
		return lexNumber
	case unicode.IsPunct(upcoming):
		return lexPunctuation
	case unicode.IsSymbol(upcoming):
		return lexSymbol
	default:
		return lexText
	}
}

// lexPunctuation is the state function for punctuation.
//
// Two leading runes get special treatment:
//   - an apostrophe followed by 't', 's', 'm' or 'd' is emitted together with
//     that letter as a lingo.Word;
//   - a '.' followed by a letter is accepted together with the next rune and
//     lexing continues in lexText.
//
// In every other case any further run of punctuation is accepted and a
// lingo.Punctuation is emitted. The next state is lexText when no run was
// accepted and the rune read first was neither punctuation nor whitespace,
// and lexWhitespace otherwise.
func lexPunctuation(l *Lexer) (fn stateFn) {
	next := l.next()
	switch next {
	case '\'':
		l.accept()
		n := l.peek()
		switch n {
		case 't', 's', 'm', 'd':
			l.next()
			l.accept() // accept 't'/'s'...
			l.emit(lingo.Word)
			return lexWhitespace
		}
	case '.':
		l.accept()
		// for cases such as "U.S" or "i.e"
		n := l.peek()
		if unicode.IsLetter(n) {
			l.accept() // accept .
			l.next()
			l.accept()
			return lexText
		}
	default:
	}

	accepted := l.acceptRunFn(unicode.IsPunct) // check for any other runs of punctuations
	punct := unicode.IsPunct(next)
	// a lone punctuation rune that was not part of a run still has to be accepted
	if accepted == 0 && punct {
		l.accept()
	}
	l.emit(lingo.Punctuation)
	if accepted == 0 && !punct && !unicode.IsSpace(next) {
		return lexText
	}
	return lexWhitespace
}

// lexSymbol is the state function for symbols. It accepts a run of symbol
// runes followed by a run of punctuation runes, emits the result as a single
// lingo.Symbol, and continues with lexWhitespace.
func lexSymbol(l *Lexer) (fn stateFn) {
	l.acceptRunFn(unicode.IsSymbol)
	l.acceptRunFn(unicode.IsPunct) // any symbol punctuation combination should be treated as a symbole
	l.emit(lingo.Symbol)
	return lexWhitespace
}

// lexURI is the state function for URIs. It advances until a space or the end
// of input and emits what was read as a lingo.URI.
//
// When the scan stopped before the end of input, the lexer steps back twice
// and re-reads one rune; if that rune is punctuation it is backed out of the
// URI and left for lexPunctuation. Otherwise lexing continues with
// lexWhitespace.
func lexURI(l *Lexer) (fn stateFn) {
	fn = lexWhitespace

	if reachedEnd := l.nextUntilEOF(" "); !reachedEnd {
		l.backup()
		l.backup()
		if last := l.next(); unicode.IsPunct(last) {
			// trailing punctuation is not part of the URI
			l.backup()
			fn = lexPunctuation
		}
	}

	l.emit(lingo.URI)
	return fn
}

// lexDate is the state function entered after a number followed by '-' or
// '/'. It accepts a second run of digits; if another '-' or '/' follows, the
// separator is recorded as '/', a third run of digits is accepted and a
// lingo.Date is emitted. Without a second separator the lexeme is emitted as
// a lingo.Number. Lexing always continues with lexWhitespace.
func lexDate(l *Lexer) (fn stateFn) {
	l.acceptRunFn(unicode.IsDigit)

	switch l.next() {
	case '/', '-':
		l.r = '/' // standardize
		l.accept()
		l.acceptRunFn(unicode.IsDigit)
		l.emit(lingo.Date)
	default:
		l.backup()
		l.emit(lingo.Number) // fractions are numbers
	}
	return lexWhitespace
}

// lexTime is the state function entered after a number followed by ':'. It
// accepts a run of digits and, if another ':' follows, that ':' and one more
// run of digits. The result is emitted as a lingo.Time and lexing continues
// with lexWhitespace.
func lexTime(l *Lexer) (fn stateFn) {
	l.acceptRunFn(unicode.IsDigit)

	if l.next() == ':' {
		l.accept()
		l.acceptRunFn(unicode.IsDigit)
	} else {
		l.backup()
	}

	l.emit(lingo.Time)
	return lexWhitespace
}


================================================
FILE: lingo.go
================================================
// Package lingo provides the data structures and algorithms required for natural language processing.
package lingo


================================================
FILE: pos/allinone_test.go
================================================
package pos

import (
	"log"
	"strings"
	"testing"

	"github.com/chewxy/lingo"
	"github.com/chewxy/lingo/lexer"
	"github.com/chewxy/lingo/treebank"
)

func TestEverything(t *testing.T) {
	sentences := treebank.ReadConllu(strings.NewReader(conllu))

	sentence := "President Bush comes on federal courts."

	p := New(WithCluster(clusters), WithLemmatizer(dummyLem{}), WithStemmer(dummyStemmer{}))
	p.Train(sentences, 200)

	l := lexer.New(sentence, strings.NewReader(sentence))
	p2 := p.Clone()
	p2.Input = l.Output

	var correct string
	if lingo.BUILD_TAGSET == "stanfordtags" {
		correct = "-ROOT-/ROOT_TAG President/NNP Bush/NNP comes/DT on/IN federal/JJ courts/NN ./FULLSTOP"
	} else {
		correct = "-ROOT-/ROOT_TAG President/PROPN Bush/PROPN comes/VERB on/ADP federal/ADJ courts/NOUN ./PUNCT"
	}

	go l.Run()
	go p2.Run()
	for a := range p2.Output {

		// this clearly isn't gonna be accurate, given the stubbed out Lemmatizer
		if a.String() != correct {
			t.Error("Something went wrong with the POSTagging")
			log.Printf("%v", a)
		}
	}

}


================================================
FILE: pos/context.go
================================================
package pos

import (
	"strconv"

	"github.com/chewxy/lingo"
)

/*
A context identifies a token position relative to the token the POSTagger is currently tagging.
There are currently 5 contexts:
	- Previous previous word
	- previous word
	- current word
	- next word
	- next next word

For each context we have 8 features:
	- word (lower case)
	- lemma
	- cluster
	- shape
	- prefix (first 1)
	- suffix (last 3)
	- POSTag
	- wordflag
*/

//go:generate stringer -type=contextType
// contextType indexes one feature slot of a contextMap. The constants below
// are laid out as `contexts` consecutive groups of `featuresPerContext`
// entries each, in the order prev2, prev, ith, next, next2; getContext relies
// on that layout when it fills a contextMap.
type contextType byte

// featuresPerContext is the number of feature slots extracted per token.
const featuresPerContext = 8

// contexts is the number of token positions considered around the ith token.
const contexts = 5
const (
	// previous previous (prev2)
	prev2Word contextType = iota
	prev2Lemma
	prev2Cluster
	prev2Shape
	prev2Prefix1
	prev2Suffix3
	prev2POSTag
	prev2Flags

	// previous
	prevWord
	prevLemma
	prevCluster
	prevShape
	prevPrefix1
	prevSuffix3
	prevPOSTag
	prevFlags

	// ith token
	ithWord
	ithLemma
	ithCluster
	ithShape
	ithPrefix1
	ithSuffix3
	ithPOSTag
	ithFlags

	// next token
	nextWord
	nextLemma
	nextCluster
	nextShape
	nextPrefix1
	nextSuffix3
	nextPOSTag
	nextFlags

	// next next token
	next2Word
	next2Lemma
	next2Cluster
	next2Shape
	next2Prefix1
	next2Suffix3
	next2POSTag
	next2Flags

	// MAXCONTEXTTYPE is the number of context types; it sizes contextMap.
	MAXCONTEXTTYPE
)

// contextMap holds one string value per contextType, indexed by the
// contextType constant.
type contextMap [MAXCONTEXTTYPE]string

// getContext builds the contextMap for a window of five annotations. The
// features of each annotation are extracted with extractContext and stored in
// that annotation's group of featuresPerContext consecutive slots, in the
// order prev2, prev, ith, next, next2.
func getContext(prev2, prev, ith, next, next2 *lingo.Annotation) (retVal contextMap) {
	window := [contexts]*lingo.Annotation{prev2, prev, ith, next, next2}

	for slot, a := range window {
		feats := extractContext(a)
		lo := slot * featuresPerContext
		copy(retVal[lo:lo+featuresPerContext], feats[:])
	}

	return retVal
}

// type featureContext struct {
// 	word    string
// 	lemma   string
// 	cluster lingo.Cluster
// 	shape   string
// 	prefix  string
// 	suffix  string
// 	POSTag  lingo.POSTag
// 	flag    lingo.WordFlag
// }

// extractContext extracts the per-token features of an annotation, in the
// slot order used by the contextType constants: lowered word, lemma, cluster
// (as a decimal string), shape, one-character prefix, three-character suffix,
// POS tag and word flags.
//
// A nil annotation yields all-empty features. The prefix is taken from the
// original value (keeping its case), the suffix from the lowered form; both
// are cut on rune boundaries rather than bytes. The suffix is empty when the
// lowered word has fewer than three runes.
func extractContext(a *lingo.Annotation) (retVal [featuresPerContext]string) {
	if a == nil {
		return retVal
	}

	// work on runes so that multi-byte characters are never split
	original := []rune(a.Value)
	lowered := []rune(a.Lowered)

	var prefix, suffix string
	if len(original) > 0 {
		prefix = string(original[0])
	}
	if n := len(lowered); n >= 3 {
		suffix = string(lowered[n-3:])
	}

	return [featuresPerContext]string{
		a.Lowered,
		a.Lemma,
		strconv.Itoa(int(a.Cluster)),
		string(a.Shape),
		prefix,
		suffix,
		a.POSTag.String(),
		a.WordFlag.String(),
	}
}


================================================
FILE: pos/context_test.go
================================================
package pos

import (
	"strings"
	"testing"

	"github.com/chewxy/lingo"
)

// extractContextTest is the table driving TestExtractContext: val and tag are
// the inputs used to build the annotation, the remaining fields are the
// expected string features returned by extractContext.
var extractContextTest = []struct {
	val string
	tag lingo.POSTag

	shape string
	pref  string
	suff  string
	flag  string
	clust string
}{
	{"TEst", lingo.ROOT_TAG, "XXxx", "T", "est", "00000000000110", "1"},
	{"TEst", lingo.X, "XXxx", "T", "est", "00000000000110", "1"},
	{"NotInClust", lingo.UNKNOWN_TAG, "XxxXxXxxxx", "N", "ust", "00000000000110", "0"},
	{"", lingo.X, "", "", "", "00000101111110", "0"},
}

// TestExtractContext builds an annotation for every row of extractContextTest
// and checks each extracted feature (except the lemma) against the row.
func TestExtractContext(t *testing.T) {
	for i, ects := range extractContextTest {
		a := lingo.StringToAnnotation(ects.val, dummyFix{})
		a.POSTag = ects.tag

		res := extractContext(a)

		// expect reports a mismatch between a wanted and an extracted feature
		expect := func(format, want, got string) {
			if got != want {
				t.Errorf(format, i, want, got)
			}
		}

		expect("Test %d: Expected word feature to be %q. Got %q instead", strings.ToLower(ects.val), res[0])
		expect("Test %d: Expected cluster to be %q. Got %q instead", ects.clust, res[2])
		expect("Test %d: Expected shape to be %q. Got %q instead", ects.shape, res[3])
		expect("Test %d: Expected prefix to be %q. Got %q instead", ects.pref, res[4])
		expect("Test %d: Expected suffix to be %q. Got %q instead", ects.suff, res[5])
		expect("Test %d: Expected postag to be %q. Got %q instead", ects.tag.String(), res[6])
		expect("Test %d: Expected flag to be %q. Got %q instead", ects.flag, res[7])
	}
}


================================================
FILE: pos/contexttype_string.go
================================================
// generated by stringer -type=contextType; DO NOT EDIT

package pos

import "fmt"

// _contextType_name is the concatenation of all contextType constant names in
// declaration order.
const _contextType_name = "prev2Wordprev2Lemmaprev2Clusterprev2Shapeprev2Prefix1prev2Suffix3prev2POSTagprev2FlagsprevWordprevLemmaprevClusterprevShapeprevPrefix1prevSuffix3prevPOSTagprevFlagsithWordithLemmaithClusterithShapeithPrefix1ithSuffix3ithPOSTagithFlagsnextWordnextLemmanextClusternextShapenextPrefix1nextSuffix3nextPOSTagnextFlagsnext2Wordnext2Lemmanext2Clusternext2Shapenext2Prefix1next2Suffix3next2POSTagnext2FlagsMAXCONTEXTTYPE"

// _contextType_index holds the start offset of every name in
// _contextType_name, plus a final entry equal to the total length.
var _contextType_index = [...]uint16{0, 9, 19, 31, 41, 53, 65, 76, 86, 94, 103, 114, 123, 134, 145, 155, 164, 171, 179, 189, 197, 207, 217, 226, 234, 242, 251, 262, 271, 282, 293, 303, 312, 321, 331, 343, 353, 365, 377, 388, 398, 412}

// String returns the name of the constant, or "contextType(N)" for values
// that have no entry in the index table.
func (i contextType) String() string {
	if i >= contextType(len(_contextType_index)-1) {
		return fmt.Sprintf("contextType(%d)", i)
	}
	return _contextType_name[_contextType_index[i]:_contextType_index[i+1]]
}


================================================
FILE: pos/debug.go
================================================
// +build debug

package pos

import (
	"log"
	"strings"
	"sync/atomic"
)

// BUILD_DEBUG labels this file's variant of the package as the debug build.
const BUILD_DEBUG = "POS TAGGER: Debug Build"

// TABCOUNT is the current logging nesting depth. It is read and written with
// sync/atomic and determines how many tabs prefix each log line.
var TABCOUNT uint32 = 0

// NOTE(review): tracking is not referenced anywhere in this file — confirm
// whether another file in the package still uses it.
var tracking = false

// tabcount atomically reads TABCOUNT and returns it as an int.
func tabcount() int {
	return int(atomic.LoadUint32(&TABCOUNT))
}

// enterLoggingContext increases the logging nesting depth by one and sets the
// standard logger's prefix to that many tab characters.
//
// The prefix is built from the value returned by the atomic increment itself,
// not from a second read of TABCOUNT, so it always reflects this call's own
// update even if another goroutine changes the counter in between.
func enterLoggingContext() {
	depth := atomic.AddUint32(&TABCOUNT, 1)
	log.SetPrefix(strings.Repeat("\t", int(depth)))
}

func leaveLoggingContext() {
	tc := tabcount()
	tc--

	if tc < 0 {
		atomic.StoreUint32(&TABCOUNT, 0)
		tc = 0
	} else {
		atomic.StoreUint32(&TABCOUNT, uint32(tc))
	}
	log.SetPrefix(strings.Repeat("\t", tc))
}

// logf forwards its arguments to log.Printf on the standard logger.
func logf(format string, others ...interface{}) {
	log.Printf(format, others...)
}

// recoverFrom logs the formatted message when a panic is in flight and then
// re-panics with the original value, so the panic itself is not swallowed.
//
// recover only has an effect when called directly by a deferred function, so
// this is presumably meant to be used as `defer recoverFrom(...)` — confirm
// against the call sites.
func recoverFrom(format string, attrs ...interface{}) {
	if r := recover(); r != nil {
		log.Printf(format, attrs...)
		panic(r)
	}
}


================================================
FILE: pos/errors.go
================================================
package pos

import "fmt"

// componentUnavailable is the error reported when a named component is not
// available. The string value is the name of the missing component.
type componentUnavailable string

// Error implements the error interface.
//
// The receiver is converted to a plain string before formatting: passing c
// itself to a %v verb would make fmt call c.Error() again and recurse without
// end.
func (c componentUnavailable) Error() string { return fmt.Sprintf("%s unavailable", string(c)) }

// Component returns the name of the unavailable component.
func (c componentUnavailable) Component() string { return string(c) }


================================================
FILE: pos/features.go
================================================
package pos

import (
	"bytes"
	"fmt"

	"github.com/chewxy/lingo"
)

// featureType identifies one kind of feature fed to the perceptron. The
// constants from bias up to (but excluding) prevLemma_prevPOSTag are
// single-valued features; the constants from prevLemma_prevPOSTag up to (but
// excluding) MAXFEATURETYPE are two-valued (tuple) features. sfFeatures and
// tfFeatures are sized from that split, so the order of the constants matters.
type featureType byte

//go:generate stringer -type=featureType
const (
	bias featureType = iota

	// words
	ithWord_
	nextWord_
	next2Word_

	// affixes of the ith token
	ithSuffix3_
	ithPrefix1_

	// tags and suffixes of the neighbouring tokens
	prevPOSTag_
	prev2POSTag_
	prevSuffix3_
	nextSuffix3_

	// shape and clusters
	ithShape_
	ithCluster_
	nextCluster_
	next2Cluster_
	prevCluster_
	prev2Cluster_

	// word flags
	ithFlags_
	nextFlags_
	next2Flags_
	prevFlags_
	prev2Flags_

	// tuple features start here
	prevLemma_prevPOSTag
	prevPOSTag_ithWord
	prevPOSTag_prev2POSTag
	prev2Lemma_prev2POSTag

	// MAXFEATURETYPE is the number of feature types.
	MAXFEATURETYPE
)

// featCtxMap maps each single-valued feature type to the contextMap slot its
// value is read from.
//
// bias has no entry here, so indexing the map with bias yields the zero
// contextType, which is prev2Word.
var featCtxMap = map[featureType]contextType{
	ithWord_:   ithWord,
	nextWord_:  nextWord,
	next2Word_: next2Word,

	ithSuffix3_: ithSuffix3,
	ithPrefix1_: ithPrefix1,

	prevPOSTag_:  prevPOSTag,
	prev2POSTag_: prev2POSTag,
	prevSuffix3_: prevSuffix3,
	nextSuffix3_: nextSuffix3,

	ithShape_:     ithShape,
	ithCluster_:   ithCluster,
	nextCluster_:  nextCluster,
	next2Cluster_: next2Cluster,
	prevCluster_:  prevCluster,
	prev2Cluster_: prev2Cluster,

	ithFlags_:   ithFlags,
	nextFlags_:  nextFlags,
	next2Flags_: next2Flags,
	prevFlags_:  prevFlags,
	prev2Flags_: prev2Flags,
}

// feature is the common interface of singleFeature and tupleFeature: a value
// that knows its featureType and can describe itself as a string.
type feature interface {
	FeatType() featureType
	String() string
}

// singleFeature is a feature made of a feature type and one string value.
type singleFeature struct {
	featureType
	value string
}

// FeatType returns the feature type of sf.
func (sf singleFeature) FeatType() featureType { return sf.featureType }

// String returns a description of the form `singleFeature{<type>, "<value>"}`.
func (sf singleFeature) String() string {
	return fmt.Sprintf("singleFeature{%v, %q}", sf.featureType, sf.value)
}

// tupleFeature is a feature made of a feature type and two string values.
type tupleFeature struct {
	featureType
	value1 string
	value2 string
}

// FeatType returns the feature type of tf.
func (tf tupleFeature) FeatType() featureType { return tf.featureType }

// String returns a description of the form
// `tupleFeature {<type>, "<value1>", "<value2>"}`.
func (tf tupleFeature) String() string {
	return fmt.Sprintf("tupleFeature {%v, %q, %q}", tf.featureType, tf.value1, tf.value2)
}

// featureMap maps features to a float64 value.
type featureMap map[feature]float64

// String lists the features in the map, one per line, in map iteration order
// (which is not stable between calls).
//
// NOTE(review): every line is written with a literal "1" regardless of the
// value stored for the feature — confirm this is intended.
func (fm featureMap) String() string {
	var buf bytes.Buffer
	for f := range fm {
		fmt.Fprintf(&buf, "%s: 1,\n", f)
	}
	return buf.String()
}

// add increments the value stored for f by one; a feature that is not yet in
// the map starts from zero. The map must be non-nil.
func (fm *featureMap) add(f feature) { (*fm)[f]++ }

// sfFeatures holds one singleFeature per single-valued feature type, indexed
// by the featureType constant (bias .. prevLemma_prevPOSTag-1).
type sfFeatures [prevLemma_prevPOSTag]singleFeature

// tfFeatures holds one tupleFeature per tuple feature type, indexed by the
// featureType constant minus prevLemma_prevPOSTag.
type tfFeatures [MAXFEATURETYPE - prevLemma_prevPOSTag]tupleFeature

// fillFromContext turns a contextMap into the feature arrays used by the
// perceptron.
//
// sf receives one singleFeature per single-valued feature type, whose value
// is read from the context slot that featCtxMap assigns to that type. A
// feature type without an entry in featCtxMap (this is the case for bias)
// gets the empty string as its value. Looking such a type up blindly would
// yield the zero contextType, prev2Word, and turn the constant bias feature
// into a copy of the prev-prev word.
//
// tf receives the four tuple features, each built from two context slots.
func fillFromContext(c contextMap) (sf sfFeatures, tf tfFeatures) {
	for i := bias; i < prevLemma_prevPOSTag; i++ {
		var value string
		if ctx, ok := featCtxMap[i]; ok {
			value = c[ctx]
		}
		sf[i] = singleFeature{i, value}
	}

	// tuple features are stored relative to the first tuple feature type
	const last = prevLemma_prevPOSTag
	tf[prevLemma_prevPOSTag-last] = tupleFeature{prevLemma_prevPOSTag, c[prevLemma], c[prevPOSTag]}
	tf[prevPOSTag_ithWord-last] = tupleFeature{prevPOSTag_ithWord, c[prevPOSTag], c[ithWord]}
	tf[prevPOSTag_prev2POSTag-last] = tupleFeature{prevPOSTag_prev2POSTag, c[prevPOSTag], c[prev2POSTag]}
	tf[prev2Lemma_prev2POSTag-last] = tupleFeature{prev2Lemma_prev2POSTag, c[prev2Lemma], c[prev2POSTag]}
	return sf, tf
}

// getFeatures returns the single and tuple features for the ith annotation of
// s. The two annotations before and the two after it provide the surrounding
// context; positions that fall outside the sentence are represented by
// lingo.NullAnnotation(). i must be a valid index into s.
func getFeatures(s lingo.AnnotatedSentence, i int) (sfFeatures, tfFeatures) {
	// every neighbour starts out as the null annotation
	prev2, prev := lingo.NullAnnotation(), lingo.NullAnnotation()
	ith := s[i]
	next, next2 := lingo.NullAnnotation(), lingo.NullAnnotation()

	// left-hand neighbours
	if i >= 1 {
		prev = s[i-1]
	}
	if i >= 2 {
		prev2 = s[i-2]
	}

	// right-hand neighbours
	if last := len(s) - 1; i < last {
		next = s[i+1]
		if i+1 < last {
			next2 = s[i+2]
		}
	}

	return fillFromContext(getContext(prev2, prev, ith, next, next2))
}


================================================
FILE: pos/features_test.go
================================================
// +build stanfordtags

package pos

import (
	"testing"

	"github.com/chewxy/lingo"
	"github.com/stretchr/testify/assert"
)

// TestGetFeatures checks the features extracted for a two-word sentence and
// for the first, middle and last positions of a five-word sentence.
//
// NOTE(review): this test looks stale relative to features.go. getFeatures is
// declared there as returning (sfFeatures, tfFeatures), while this test
// assigns its result to a single variable and compares it with a featureMap.
// The test also uses prevPOSTag_prev2_POSTag, whereas the constant declared
// in features.go is prevPOSTag_prev2POSTag. The file is only built with the
// stanfordtags build tag.
func TestGetFeatures(t *testing.T) {
	assert := assert.New(t)

	// test two word sentence
	s2 := lingo.AnnotatedSentence{
		lingo.AnnotationFromLexTag(lingo.Lexeme{"most", lingo.Word, -1, -1}, lingo.RBS, dummyFix{}),
		lingo.AnnotationFromLexTag(lingo.Lexeme{"populous", lingo.Word, -1, -1}, lingo.X, dummyFix{}),
	}

	featMap := getFeatures(s2, 0)
	expectedFM := featureMap{
		singleFeature{bias, ""}:                       1,
		singleFeature{ithWord_, "most"}:               1,
		tupleFeature{prevLemma_prevPOSTag, "", "X"}:   1,
		tupleFeature{prev2Lemma_prev2POSTag, "", "X"}: 1,
		singleFeature{nextWord_, "populous"}:          1,
		singleFeature{next2Word_, ""}:                 1,

		singleFeature{ithSuffix3_, "ost"}: 1,
		singleFeature{ithPrefix1_, "m"}:   1,

		singleFeature{prevPOSTag_, "X"}:                 1,
		singleFeature{prev2POSTag_, "X"}:                1,
		tupleFeature{prevPOSTag_prev2_POSTag, "X", "X"}: 1,
		tupleFeature{prevPOSTag_ithWord, "X", "most"}:   1,
		singleFeature{prevSuffix3_, ""}:                 1,
		singleFeature{nextSuffix3_, "ous"}:              1,

		singleFeature{ithShape_, "xxxx"}:  1,
		singleFeature{ithCluster_, "0"}:   1,
		singleFeature{nextCluster_, "0"}:  1,
		singleFeature{next2Cluster_, "0"}: 1,
		singleFeature{prevCluster_, "0"}:  1,
		singleFeature{prev2Cluster_, "0"}: 1,

		singleFeature{ithFlags_, "01000000010110"}:   1,
		singleFeature{nextFlags_, "00000000010110"}:  1,
		singleFeature{next2Flags_, "00000000000000"}: 1,
		singleFeature{prevFlags_, "00000000000000"}:  1,
		singleFeature{prev2Flags_, "00000000000000"}: 1,
	}
	assert.EqualValues(expectedFM, featMap, "Want: \n%v\n\nGot: \n%v", expectedFM, featMap)

	// test five word sentence
	s5 := lingo.AnnotatedSentence{
		lingo.AnnotationFromLexTag(lingo.Lexeme{"most", lingo.Word, -1, -1}, lingo.RBS, dummyFix{}),
		lingo.AnnotationFromLexTag(lingo.Lexeme{"populous", lingo.Word, -1, -1}, lingo.X, dummyFix{}),
		lingo.AnnotationFromLexTag(lingo.Lexeme{"state", lingo.Word, -1, -1}, lingo.X, dummyFix{}),
		lingo.AnnotationFromLexTag(lingo.Lexeme{"in", lingo.Word, -1, -1}, lingo.X, dummyFix{}),
		lingo.AnnotationFromLexTag(lingo.Lexeme{"America", lingo.Word, -1, -1}, lingo.X, dummyFix{}),
	}

	featMap = getFeatures(s5, 0) // no prev

	expectedFM = featureMap{
		singleFeature{bias, ""}:                       1,
		singleFeature{ithWord_, "most"}:               1,
		tupleFeature{prevLemma_prevPOSTag, "", "X"}:   1,
		tupleFeature{prev2Lemma_prev2POSTag, "", "X"}: 1,
		singleFeature{nextWord_, "populous"}:          1,
		singleFeature{next2Word_, "state"}:            1,

		singleFeature{ithSuffix3_, "ost"}: 1,
		singleFeature{ithPrefix1_, "m"}:   1,

		singleFeature{prevPOSTag_, "X"}:                 1,
		singleFeature{prev2POSTag_, "X"}:                1,
		tupleFeature{prevPOSTag_prev2_POSTag, "X", "X"}: 1,
		tupleFeature{prevPOSTag_ithWord, "X", "most"}:   1,
		singleFeature{prevSuffix3_, ""}:                 1,
		singleFeature{nextSuffix3_, "ous"}:              1,

		singleFeature{ithShape_, "xxxx"}:  1,
		singleFeature{ithCluster_, "0"}:   1,
		singleFeature{nextCluster_, "0"}:  1,
		singleFeature{next2Cluster_, "0"}: 1,
		singleFeature{prevCluster_, "0"}:  1,
		singleFeature{prev2Cluster_, "0"}: 1,

		singleFeature{ithFlags_, "01000000010110"}:   1,
		singleFeature{nextFlags_, "00000000010110"}:  1,
		singleFeature{next2Flags_, "00000000010110"}: 1,
		singleFeature{prevFlags_, "00000000000000"}:  1,
		singleFeature{prev2Flags_, "00000000000000"}: 1,
	}
	assert.EqualValues(expectedFM, featMap, "Want: \n%v\n\nGot: \n%v", expectedFM, featMap)

	featMap = getFeatures(s5, 2) // has all the feats
	expectedFM = featureMap{
		singleFeature{bias, ""}:                         1,
		singleFeature{ithWord_, "state"}:                1,
		tupleFeature{prev2Lemma_prev2POSTag, "", "RBS"}: 1,
		tupleFeature{prevLemma_prevPOSTag, "", "X"}:     1,
		singleFeature{nextWord_, "in"}:                  1,
		singleFeature{next2Word_, "america"}:            1,

		singleFeature{ithSuffix3_, "ate"}: 1,
		singleFeature{ithPrefix1_, "s"}:   1,

		singleFeature{prevPOSTag_, "X"}:                   1,
		singleFeature{prev2POSTag_, "RBS"}:                1,
		tupleFeature{prevPOSTag_prev2_POSTag, "X", "RBS"}: 1,
		tupleFeature{prevPOSTag_ithWord, "X", "state"}:    1,
		singleFeature{prevSuffix3_, "ous"}:                1,
		singleFeature{nextSuffix3_, ""}:                   1,

		singleFeature{ithShape_, "xxxx"}:  1,
		singleFeature{ithCluster_, "0"}:   1,
		singleFeature{nextCluster_, "0"}:  1,
		singleFeature{next2Cluster_, "0"}: 1,
		singleFeature{prevCluster_, "0"}:  1,
		singleFeature{prev2Cluster_, "0"}: 1,

		singleFeature{ithFlags_, "00000000010110"}:   1,
		singleFeature{nextFlags_, "01000000010110"}:  1,
		singleFeature{next2Flags_, "00000010000110"}: 1,
		singleFeature{prevFlags_, "00000000010110"}:  1,
		singleFeature{prev2Flags_, "01000000010110"}: 1,
	}
	assert.EqualValues(expectedFM, featMap, "Want: \n%v\n\nGot: \n%v", expectedFM, featMap)

	featMap = getFeatures(s5, 4) // no nexts

	expectedFM = featureMap{
		singleFeature{bias, ""}:                       1,
		singleFeature{ithWord_, "america"}:            1,
		tupleFeature{prev2Lemma_prev2POSTag, "", "X"}: 1,
		tupleFeature{prevLemma_prevPOSTag, "", "X"}:   1,
		singleFeature{nextWord_, ""}:                  1,
		singleFeature{next2Word_, ""}:                 1,

		singleFeature{ithSuffix3_, "ica"}: 1,
		singleFeature{ithPrefix1_, "A"}:   1,

		singleFeature{prevPOSTag_, "X"}:                  1,
		singleFeature{prev2POSTag_, "X"}:                 1,
		tupleFeature{prevPOSTag_prev2_POSTag, "X", "X"}:  1,
		tupleFeature{prevPOSTag_ithWord, "X", "america"}: 1,
		singleFeature{prevSuffix3_, ""}:                  1,
		singleFeature{nextSuffix3_, ""}:                  1,

		singleFeature{ithShape_, "Xxxxx"}: 1,
		singleFeature{ithCluster_, "0"}:   1,
		singleFeature{nextCluster_, "0"}:  1,
		singleFeature{next2Cluster_, "0"}: 1,
		singleFeature{prevCluster_, "0"}:  1,
		singleFeature{prev2Cluster_, "0"}: 1,

		singleFeature{ithFlags_, "00000010000110"}:   1,
		singleFeature{nextFlags_, "00000000000000"}:  1,
		singleFeature{next2Flags_, "00000000000000"}: 1,
		singleFeature{prevFlags_, "01000000010110"}:  1,
		singleFeature{prev2Flags_, "00000000010110"}: 1,
	}

	assert.EqualValues(expectedFM, featMap, "Want: \n%v\n\nGot: \n%v", expectedFM, featMap)
}


================================================
FILE: pos/featuretype_string.go
================================================
// generated by stringer -type=featureType; DO NOT EDIT

package pos

import "fmt"

// _featureType_name is the concatenation of all featureType constant names in
// declaration order. It has to follow the order of the const block in
// features.go; when that block changes, regenerate this file with
// `go generate` (stringer -type=featureType) instead of editing it by hand.
const _featureType_name = "biasithWord_nextWord_next2Word_ithSuffix3_ithPrefix1_prevPOSTag_prev2POSTag_prevSuffix3_nextSuffix3_ithShape_ithCluster_nextCluster_next2Cluster_prevCluster_prev2Cluster_ithFlags_nextFlags_next2Flags_prevFlags_prev2Flags_prevLemma_prevPOSTagprevPOSTag_ithWordprevPOSTag_prev2POSTagprev2Lemma_prev2POSTagMAXFEATURETYPE"

// _featureType_index holds the start offset of every name in
// _featureType_name, plus a final entry equal to the total length.
var _featureType_index = [...]uint16{0, 4, 12, 21, 31, 42, 53, 64, 76, 88, 100, 109, 120, 132, 145, 157, 170, 179, 189, 200, 210, 221, 241, 259, 281, 303, 317}

// String returns the name of the constant, or "featureType(N)" for values
// that have no entry in the index table.
func (i featureType) String() string {
	if i >= featureType(len(_featureType_index)-1) {
		return fmt.Sprintf("featureType(%d)", i)
	}
	return _featureType_name[_featureType_index[i]:_featureType_index[i+1]]
}


================================================
FILE: pos/models.go
================================================
package pos

import (
	"bufio"
	"encoding/gob"
	"io"
	"os"

	"github.com/chewxy/lingo"
)

// Model is the model that the POS Tagger runs on. It bundles the perceptron
// with a map from strings to POS tags (cachedTags); both are written by
// SaveWriter and read back by LoadReader.
type Model struct {
	*perceptron
	cachedTags map[string]lingo.POSTag
}

// Save saves the model to the named file, creating it or truncating an
// existing one. The file is handed to SaveWriter, which also closes it.
func (m *Model) Save(filename string) error {
	f, err := os.Create(filename)
	if err != nil {
		return err
	}
	return m.SaveWriter(f)
}

// SaveWriter gob-encodes the model (the perceptron, then the cached tags) to
// f through a buffered writer, and closes f.
//
// f is always closed, whether or not encoding succeeds. The returned error is
// the first failure among encoding, flushing the buffer and closing f, so a
// write failure that only surfaces at flush or close time is reported to the
// caller instead of being dropped.
func (m *Model) SaveWriter(f io.WriteCloser) (err error) {
	defer func() {
		// keep an earlier error; otherwise report the close error
		if cerr := f.Close(); err == nil {
			err = cerr
		}
	}()

	w := bufio.NewWriter(f)
	encoder := gob.NewEncoder(w)

	if err = encoder.Encode(m.perceptron); err != nil {
		return err
	}

	if err = encoder.Encode(m.cachedTags); err != nil {
		return err
	}

	// the encoded data only reaches f once the buffer is flushed
	return w.Flush()
}

// Load opens the named file and reads a Model from it with LoadReader, which
// also closes the file.
func Load(filename string) (*Model, error) {
	f, err := os.Open(filename)
	if err != nil {
		return nil, err
	}
	return LoadReader(f)
}

// LoadReader reads a Model from rd and closes rd when done. The data is
// expected in the order SaveWriter writes it: the gob-encoded perceptron
// followed by the gob-encoded cached tags. On a decoding error the model is
// discarded and only the error is returned.
func LoadReader(rd io.ReadCloser) (*Model, error) {
	defer rd.Close()

	decoder := gob.NewDecoder(bufio.NewReader(rd))
	m := &Model{perceptron: newPerceptron()}

	// decode the parts in the order they were written
	for _, part := range []interface{}{m.perceptron, &m.cachedTags} {
		if err := decoder.Decode(part); err != nil {
			return nil, err
		}
	}

	return m, nil
}

// Load reads a Model from the named file and installs it as the tagger's
// model. When loading fails the error is returned and the tagger's current
// model is left untouched.
func (p *Tagger) Load(filename string) error {
	m, err := Load(filename)
	if err != nil {
		return err
	}
	p.Model = m
	return nil
}


================================================
FILE: pos/models_test.go
================================================
package pos

import (
	"os"
	"strings"
	"testing"

	"github.com/chewxy/lingo/treebank"
	"github.com/stretchr/testify/assert"
)

// TestSaveLoad trains a tagger briefly, saves its model to a file, loads the
// file into a second tagger and checks that both hold equal perceptrons and
// cached tags.
func TestSaveLoad(t *testing.T) {
	const filename = "test.dat"

	pt := New()
	sentences := treebank.ReadConllu(strings.NewReader(conllu))
	pt.Train(sentences, 5)

	// remove the file on every exit path, including t.Fatal and failed saves
	// that left a partial file behind
	defer os.Remove(filename)

	// a failed save must fail the test here rather than surface later as a
	// confusing load error
	if err := pt.Save(filename); err != nil {
		t.Fatal(err)
	}

	pt2 := New()
	if err := pt2.Load(filename); err != nil {
		t.Fatal(err)
	}

	assert := assert.New(t)

	assert.Equal(pt.perceptron, pt2.perceptron, "POSTaggers' perceptrons are different:%p %p", pt.perceptron, pt2.perceptron)
	assert.Equal(pt.cachedTags, pt2.cachedTags, "POSTaggers' cachedTags are different")
}


================================================
FILE: pos/perceptron.go
================================================
package pos

import "github.com/chewxy/lingo"

// perceptron is the weight store behind the tagger. It keeps one weight per
// (feature, tag) pair, plus the bookkeeping that average uses to turn the
// weights into averages over all instances seen.
type perceptron struct {
	// weights map[feature]*[lingo.MAXTAG]float64 // it's a pointer to a static array because map values are immutable, and cannot be edited

	// per-feature weight rows, indexed by POS tag; the rows are pointers so
	// that they can be modified in place
	weightsSF map[singleFeature]*[lingo.MAXTAG]float64
	weightsTF map[tupleFeature]*[lingo.MAXTAG]float64

	// totals accumulates, per (feature, tag), weight multiplied by the number
	// of instances it was in effect for; steps records the value of
	// instancesSeen at the last update of that pair
	totals map[fctuple]float64
	steps  map[fctuple]float64

	// instancesSeen counts the calls to update
	instancesSeen float64
}

// fctuple is a feature-class tuple: it contains a feature and a class (a POS
// tag). It is the key of the perceptron's totals and steps maps, which makes
// calculation of the averaging easier.
type fctuple struct {
	feature
	lingo.POSTag
}

// newPerceptron returns a perceptron with all of its maps allocated and empty
// and with no instances seen.
func newPerceptron() *perceptron {
	return &perceptron{
		// weights: make(map[feature]*[lingo.MAXTAG]float64),

		weightsSF: make(map[singleFeature]*[lingo.MAXTAG]float64),
		weightsTF: make(map[tupleFeature]*[lingo.MAXTAG]float64),

		totals: make(map[fctuple]float64),
		steps:  make(map[fctuple]float64),
	}
}

// updateWeightsSF sets the weight of single feature f for tag to
// weight+value, where weight is the pair's current weight.
//
// Before the change, the current weight is credited to the pair's running
// total for every instance seen since the pair was last updated, and the
// pair's step is moved to the current instance count. A weight row for f is
// allocated on first use.
func (p *perceptron) updateWeightsSF(f singleFeature, tag lingo.POSTag, weight, value float64) {
	key := fctuple{f, tag}
	p.totals[key] += (p.instancesSeen - p.steps[key]) * weight
	p.steps[key] = p.instancesSeen

	row, known := p.weightsSF[f]
	if !known {
		row = new([lingo.MAXTAG]float64)
		p.weightsSF[f] = row
	}
	row[tag] = weight + value
}

// updateWeightsTF sets the weight of tuple feature f for tag to weight+value,
// where weight is the pair's current weight.
//
// Before the change, the current weight is credited to the pair's running
// total for every instance seen since the pair was last updated, and the
// pair's step is moved to the current instance count. A weight row for f is
// allocated on first use.
func (p *perceptron) updateWeightsTF(f tupleFeature, tag lingo.POSTag, weight, value float64) {
	key := fctuple{f, tag}
	p.totals[key] += (p.instancesSeen - p.steps[key]) * weight
	p.steps[key] = p.instancesSeen

	row, known := p.weightsTF[f]
	if !known {
		row = new([lingo.MAXTAG]float64)
		p.weightsTF[f] = row
	}
	row[tag] = weight + value
}

// update records one training instance. The instance counter is always
// incremented; the weights only change when the guess was wrong. In that case
// every feature of the instance has its weight for the true tag raised by one
// and its weight for the guessed tag lowered by one. Features without a
// weight row yet start from zero.
func (p *perceptron) update(guess, truth lingo.POSTag, sf sfFeatures, tf tfFeatures) {
	p.instancesSeen++
	if truth == guess {
		return
	}

	// single-valued features
	for i := range sf {
		f := sf[i]

		var forTruth, forGuess float64
		if row, known := p.weightsSF[f]; known {
			forTruth, forGuess = row[truth], row[guess]
		}

		p.updateWeightsSF(f, truth, forTruth, 1)
		p.updateWeightsSF(f, guess, forGuess, -1)
	}

	// tuple features
	for i := range tf {
		f := tf[i]

		var forTruth, forGuess float64
		if row, known := p.weightsTF[f]; known {
			forTruth, forGuess = row[truth], row[guess]
		}

		p.updateWeightsTF(f, truth, forTruth, 1)
		p.updateWeightsTF(f, guess, forGuess, -1)
	}
}

// predict scores every tag by summing, over all given features that have a
// weight row, the feature's weight for that tag. The tag is then selected
// from the scores by maxScore.
func (p *perceptron) predict(sf sfFeatures, tf tfFeatures) lingo.POSTag {
	var scores [lingo.MAXTAG]float64

	// accumulate adds one feature's weight row to the running scores
	accumulate := func(row *[lingo.MAXTAG]float64) {
		for tag := range scores {
			scores[tag] += row[tag]
		}
	}

	for _, f := range sf {
		if row, known := p.weightsSF[f]; known {
			accumulate(row)
		}
	}

	for _, f := range tf {
		if row, known := p.weightsTF[f]; known {
			accumulate(row)
		}
	}

	return maxScore(&scores)
}

// average replaces every weight with its average over all instances seen.
// For each (feature, tag) pair the running total is first brought up to date
// with the current weight for the instances since the pair's last update, and
// then divided by the number of instances seen.
func (p *perceptron) average() {
	for f, row := range p.weightsSF {
		for c := range row {
			key := fctuple{f, lingo.POSTag(c)}
			pending := (p.instancesSeen - p.steps[key]) * row[c]
			row[c] = (p.totals[key] + pending) / p.instancesSeen
		}
	}

	for f, row := range p.weightsTF {
		for c := range row {
			key := fctuple{f, lingo.POSTag(c)}
			pending := (p.instancesSeen - p.steps[key]) * row[c]
			row[c] = (p.totals[key] + pending) / p.instancesSeen
		}
	}
}


================================================
FILE: pos/perceptron_io.go
================================================
package pos

import (
	"bytes"
	"encoding/gob"
)

/* Feature Gob interface */

// GobEncode implements gob.GobEncoder. The feature type and the value are
// written, in that order, as two gob values.
func (sf singleFeature) GobEncode() ([]byte, error) {
	var out bytes.Buffer
	enc := gob.NewEncoder(&out)

	for _, field := range []interface{}{sf.featureType, sf.value} {
		if err := enc.Encode(field); err != nil {
			return nil, err
		}
	}

	return out.Bytes(), nil
}

// GobDecode implements gob.GobDecoder. It reads the feature type and the
// value in the order GobEncode wrote them.
func (sf *singleFeature) GobDecode(buf []byte) error {
	dec := gob.NewDecoder(bytes.NewBuffer(buf))

	for _, field := range []interface{}{&sf.featureType, &sf.value} {
		if err := dec.Decode(field); err != nil {
			return err
		}
	}

	return nil
}

// GobEncode implements gob.GobEncoder. The feature type and the two values
// are written, in that order, as three gob values.
func (tf tupleFeature) GobEncode() ([]byte, error) {
	var out bytes.Buffer
	enc := gob.NewEncoder(&out)

	for _, field := range []interface{}{tf.featureType, tf.value1, tf.value2} {
		if err := enc.Encode(field); err != nil {
			return nil, err
		}
	}

	return out.Bytes(), nil
}

// GobDecode implements gob.GobDecoder. It reads the feature type and the two
// values in the order GobEncode wrote them.
func (tf *tupleFeature) GobDecode(buf []byte) error {
	dec := gob.NewDecoder(bytes.NewBuffer(buf))

	for _, field := range []interface{}{&tf.featureType, &tf.value1, &tf.value2} {
		if err := dec.Decode(field); err != nil {
			return err
		}
	}

	return nil
}

/* fctuple Gob Interface */
// GobEncode implements gob.GobEncoder. The feature is encoded through a
// pointer to the field, followed by the POSTag.
func (fc fctuple) GobEncode() ([]byte, error) {
	var out bytes.Buffer
	enc := gob.NewEncoder(&out)

	// The order of this list is the wire format read back by GobDecode.
	for _, field := range []interface{}{&fc.feature, fc.POSTag} {
		if err := enc.Encode(field); err != nil {
			return nil, err
		}
	}

	return out.Bytes(), nil
}

// GobDecode implements gob.GobDecoder. It reads the feature and then the
// POSTag from buf, in the same order GobEncode wrote them.
func (fc *fctuple) GobDecode(buf []byte) error {
	dec := gob.NewDecoder(bytes.NewBuffer(buf))

	for _, target := range []interface{}{&fc.feature, &fc.POSTag} {
		if err := dec.Decode(target); err != nil {
			return err
		}
	}

	return nil
}

/* Perceptron Gob Interface */

// GobEncode implements gob.GobEncoder for the perceptron. The fields are
// written in a fixed order: weightsSF, weightsTF, totals, steps and finally
// instancesSeen. No other field is serialized here.
func (p *perceptron) GobEncode() ([]byte, error) {
	var out bytes.Buffer
	enc := gob.NewEncoder(&out)

	// The order of this list is the wire format; GobDecode reads the values
	// back in exactly this order.
	fields := []interface{}{
		&p.weightsSF,
		&p.weightsTF,
		&p.totals,
		&p.steps,
		p.instancesSeen,
	}
	for _, field := range fields {
		if err := enc.Encode(field); err != nil {
			return nil, err
		}
	}

	return out.Bytes(), nil
}

// GobDecode implements gob.GobDecoder for the perceptron. It restores
// weightsSF, weightsTF, totals, steps and instancesSeen from buf, in the order
// GobEncode wrote them. Decoding stops at the first error, which is returned.
func (p *perceptron) GobDecode(buf []byte) error {
	dec := gob.NewDecoder(bytes.NewBuffer(buf))

	targets := []interface{}{
		&p.weightsSF,
		&p.weightsTF,
		&p.totals,
		&p.steps,
		&p.instancesSeen,
	}
	for _, target := range targets {
		if err := dec.Decode(target); err != nil {
			return err
		}
	}

	return nil
}

// init registers the concrete feature types with gob.
func init() {
	for _, concrete := range []interface{}{singleFeature{}, tupleFeature{}} {
		gob.Register(concrete)
	}
}


================================================
FILE: pos/perceptron_io_test.go
================================================
// +build stanfordtags

package pos

import (
	"bytes"
	"encoding/gob"
	"testing"

	"github.com/chewxy/lingo"
	"github.com/stretchr/testify/assert"
)

// TestFeatureSerialization round-trips a singleFeature and a tupleFeature
// through gob while they are held in variables of the feature interface type,
// and checks that the decoded values equal the originals.
func TestFeatureSerialization(t *testing.T) {
	var f, f2 feature
	f = singleFeature{ithWord_, "hello"}
	f2 = tupleFeature{ithWord_, "hello", "world"}

	// The encoder and decoder share one buffer, so values are decoded in the
	// order they were encoded.
	var buf bytes.Buffer
	encoder := gob.NewEncoder(&buf)
	decoder := gob.NewDecoder(&buf)

	// Pointers to the interface variables are encoded, not the concrete values.
	if err := encoder.Encode(&f); err != nil {
		t.Fatal(err)
	}

	if err := encoder.Encode(&f2); err != nil {
		t.Fatal(err)
	}

	var decodedF, decodedF2 feature
	if err := decoder.Decode(&decodedF); err != nil {
		t.Fatal(err)
	}

	if err := decoder.Decode(&decodedF2); err != nil {
		t.Fatal(err)
	}

	assert.Equal(t, f, decodedF, "feature not deserialized properly")
	assert.Equal(t, f2, decodedF2, "feature not deserialized properly")
}

// TestPerceptron_Serialize encodes a perceptron with a few hand-set values,
// decodes it into a fresh perceptron and compares the two field by field.
//
// NOTE(review): this test reads and writes p.weights, while the perceptron's
// GobEncode/GobDecode serialize weightsSF and weightsTF (their handling of a
// weights field is commented out). Confirm this test still compiles and still
// exercises the serialized fields when built with the stanfordtags tag.
func TestPerceptron_Serialize(t *testing.T) {
	p := newPerceptron()

	// set up a dummy weight
	f := singleFeature{ithWord_, "hello"}
	w := new([lingo.MAXTAG]float64)
	w[lingo.NN] = 0.5
	w[lingo.VB] = 0.1
	p.weights[f] = w

	// one entry each for the totals and steps bookkeeping maps
	fc := fctuple{f, lingo.VB}
	p.totals[fc] = 0.1337
	p.steps[fc] = 0.65535

	p.instancesSeen = 1022

	// The encoder and decoder share one buffer.
	var buf bytes.Buffer
	encoder := gob.NewEncoder(&buf)
	decoder := gob.NewDecoder(&buf)

	// encode
	if err := encoder.Encode(p); err != nil {
		t.Fatal(err)
	}

	// decode
	p2 := newPerceptron()
	if err := decoder.Decode(p2); err != nil {
		t.Fatal(err)
	}

	// shadows the assert package with an *Assertions bound to t
	assert := assert.New(t)

	assert.Equal(p.weights, p2.weights, "The weights have not been deserialized properly")
	assert.Equal(p.totals, p2.totals, "Totals have not been deserialized properly")
	assert.Equal(p.steps, p2.steps, "Steps have not been deserialized properly")
	assert.Equal(p.instancesSeen, p2.instancesSeen, "InstancesSeen not deserialized properly")
}


================================================
FILE: pos/postagger.go
================================================
package pos

import (
	"github.com/chewxy/lingo"
	"github.com/chewxy/lingo/corpus"
	"github.com/chewxy/lingo/treebank"
)

// Tagger is the object that tags an incoming channel of lexemes,
// and outputs a channel of AnnotatedSentence. Each of the Annotation
// are tagged with the POSTag
//
// The core of the Tagger is the perceptron (unexported).
//
// A large percentage of how this POS Tagger works is inspired by Matthew Honnibal's work in SpaCy
type Tagger struct {
	// Model is embedded; New creates one when no WithModel option supplies it.
	*Model

	// Input is the channel of lexemes that Run consumes (via getSentences).
	// New does not allocate it.
	Input    chan lingo.Lexeme
	// Output receives each tagged sentence from Run, which closes it on return.
	Output   chan lingo.AnnotatedSentence
	// progress is created lazily by Progress() and written to by Train.
	progress chan Progress

	// sentences carries assembled sentences from getSentences to Run.
	sentences chan lingo.AnnotatedSentence

	// Lemmatizer and Stemmer are optional; the Lemmatize and Stem methods
	// return an error when the corresponding component is nil.
	lingo.Lemmatizer
	lingo.Stemmer
	corpus   *corpus.Corpus
	clusters map[string]lingo.Cluster // this map is safe for concurrent access because it's readonly
}

// ConsOpt is a construction option for a Tagger. New applies every option it
// is given, in order, to the Tagger being built.
type ConsOpt func(*Tagger)

// WithCorpus returns a ConsOpt that makes the Tagger use the existing corpus c.
func WithCorpus(c *corpus.Corpus) ConsOpt {
	return func(t *Tagger) { t.corpus = c }
}

// WithLemmatizer returns a ConsOpt that installs l as the Tagger's lemmatizer.
// Without a lemmatizer the lemmatization step is skipped, and the tagger will
// be less accurate.
func WithLemmatizer(l lingo.Lemmatizer) ConsOpt {
	return func(t *Tagger) { t.Lemmatizer = l }
}

// WithStemmer returns a ConsOpt that installs s as the Tagger's stemmer.
// Without a stemmer the stemming step is skipped, and the tagger will be less
// accurate.
func WithStemmer(s lingo.Stemmer) ConsOpt {
	return func(t *Tagger) { t.Stemmer = s }
}

// WithCluster returns a ConsOpt that gives the Tagger a brown cluster corpus
// (a map of strings to their brown clusters). Without it no cluster is set,
// and the tagger will be less accurate.
func WithCluster(c map[string]lingo.Cluster) ConsOpt {
	return func(t *Tagger) { t.clusters = c }
}

// WithModel returns a ConsOpt that makes the Tagger use the model m.
func WithModel(m *Model) ConsOpt {
	return func(t *Tagger) { t.Model = m }
}

// New creates a new *Tagger. The Output and internal sentences channels are
// allocated here; Input is not. Each option is applied in order, and if no
// option supplied a Model, a fresh one is created with a new perceptron and an
// empty tag cache.
func New(opts ...ConsOpt) *Tagger {
	tagger := &Tagger{
		Output:    make(chan lingo.AnnotatedSentence),
		sentences: make(chan lingo.AnnotatedSentence),
	}

	for i := range opts {
		opts[i](tagger)
	}

	// A model supplied through an option is used as-is.
	if tagger.Model != nil {
		return tagger
	}

	tagger.Model = &Model{perceptron: newPerceptron()}
	tagger.cachedTags = make(map[string]lingo.POSTag)
	return tagger
}

// Clone makes a copy of a Tagger. The clone shares the Model, corpus,
// Lemmatizer, Stemmer and clusters with p, gets its own Output and sentences
// channels, and leaves Input and progress unset.
func (p *Tagger) Clone() *Tagger {
	clone := new(Tagger)

	// shared with the original
	clone.Model = p.Model
	clone.corpus = p.corpus
	clone.Lemmatizer = p.Lemmatizer
	clone.Stemmer = p.Stemmer
	clone.clusters = p.clusters

	// private to the clone
	clone.Output = make(chan lingo.AnnotatedSentence)
	clone.sentences = make(chan lingo.AnnotatedSentence)

	return clone
}

// Run is used to tag sentences. Lexemes arrive from the lexer on the
// *Tagger.Input channel; getSentences (started here in its own goroutine)
// groups them into sentences, and every non-empty sentence is tagged and sent
// down the Output channel. Output is closed when Run returns.
func (p *Tagger) Run() {
	defer close(p.Output)

	go p.getSentences()

	for sentence := range p.sentences {
		if len(sentence) == 0 {
			continue
		}

		for idx := range sentence {
			annotation := sentence[idx]

			// Only fall back to the perceptron when no shortcut tag exists.
			tag, shortcutted := p.shortcut(annotation.Lexeme)
			if !shortcutted {
				sf, tf := getFeatures(sentence, idx)
				tag = p.perceptron.predict(sf, tf)
			}

			p.setTag(annotation, tag)
		}

		p.Output <- sentence
	}
}

// Lemmatize implements the lingo.Lemmatizer interface by delegating to the
// Lemmatizer the Tagger was built with. If there is none, it returns the error
// from componentUnavailable("lemmatizer").
func (p *Tagger) Lemmatize(a string, pt lingo.POSTag) ([]string, error) {
	if lem := p.Lemmatizer; lem != nil {
		return lem.Lemmatize(a, pt)
	}
	return nil, componentUnavailable("lemmatizer")
}

// Stem implements the lingo.Stemmer interface by delegating to the Stemmer the
// Tagger was built with. If there is none, it returns the error from
// componentUnavailable("stemmer").
func (p *Tagger) Stem(a string) (string, error) {
	if stemmer := p.Stemmer; stemmer != nil {
		return stemmer.Stem(a)
	}
	return "", componentUnavailable("stemmer")
}

// Clusters implements the lingo.AnnotationFixer interface. It returns the
// cluster map the Tagger was built with, or the error from
// componentUnavailable("clusters") when that map is nil.
func (p *Tagger) Clusters() (map[string]lingo.Cluster, error) {
	if p.clusters != nil {
		return p.clusters, nil
	}
	return nil, componentUnavailable("clusters")
}

// Progress returns the channel on which training progress is reported,
// creating it (unbuffered) on first use. By default the channel does not
// exist, and no progress info is sent.
func (p *Tagger) Progress() <-chan Progress {
	if p.progress != nil {
		return p.progress
	}
	p.progress = make(chan Progress)
	return p.progress
}

// Train trains a POSTagger, given a bunch of SentenceTags.
//
// It runs `iterations` passes over `sentences`, shuffling them (in place) after
// every pass. Within a pass each annotation is tagged either from the shortcut
// lookup or by the perceptron; only perceptron guesses trigger a weight update.
//
// If a progress channel exists when Train starts (see Progress), one Progress
// value is sent per iteration and the channel is closed and reset to nil when
// Train returns. The send is a plain channel send, so it waits for a receiver.
func (p *Tagger) Train(sentences []treebank.SentenceTag, iterations int) {
	if p.progress != nil {
		defer func() {
			close(p.progress)
			p.progress = nil
		}()
	}

	// Pre-populate the per-word tag cache used by shortcut().
	p.fillCache(sentences)

	// Somehow sentenceTag.AnnotatedSentence() is memory leaky.
	// As a result, the more training iterations there is, the more memory is used and not released
	// hence the cache is necessary.
	cache := make(map[string]lingo.AnnotatedSentence)
	for iter := 0; iter < iterations; iter++ {
		c := 0           // guesses that matched the truth in this iteration
		n := 0           // annotations processed in this iteration
		shortcutted := 0 // annotations tagged via shortcut() in this iteration

		var s lingo.AnnotatedSentence
		for _, sentenceTag := range sentences {
			// The truth tags are prefixed with ROOT_TAG so they can be indexed
			// by position in s.
			// NOTE(review): presumably AnnotatedSentence(p) puts a root
			// annotation at index 0 — confirm; tags[i] below relies on
			// len(tags) >= len(s).
			tags := []lingo.POSTag{lingo.ROOT_TAG}
			tags = append(tags, sentenceTag.Tags...)

			var ok bool
			if s, ok = cache[sentenceTag.String()]; !ok {
				s = sentenceTag.AnnotatedSentence(p) // the fixer is used to extract cluster information, etc into the *Annotation
				cache[sentenceTag.String()] = s
			}

			length := len(s)
			if length == 0 {
				continue
			}

			// Cached sentences still carry the tags set in an earlier
			// iteration; reset every non-root annotation to X before tagging.
			for _, a := range s {
				if a == lingo.RootAnnotation() {
					continue
				}
				a.POSTag = lingo.X
			}

			for i, a := range s {
				// processing
				truth := tags[i]

				guess, ok := p.shortcut(a.Lexeme)
				if !ok {
					sf, tf := getFeatures(s, i)
					guess = p.perceptron.predict(sf, tf)
					p.perceptron.update(guess, truth, sf, tf)
				} else {
					shortcutted++
				}
				p.setTag(a, guess)

				if guess == truth {
					c++
				}
				n++
			}
		}

		// NOTE(review): iter%150 == 0 is also true for iter 0, so the
		// perceptron is averaged after the very first pass and training then
		// continues on the averaged weights — confirm this is intended.
		if iter%150 == 0 {
			p.perceptron.average()
			logf("Averaged perceptron")
		}

		if p.progress != nil {
			p.progress <- Progress{Iter: iter, Correct: c, Count: n, ShortCutted: shortcutted}
		}

		treebank.ShuffleSentenceTag(sentences)
	}
	p.perceptron.average()
}

// LoadShortcuts allows for domain specific things to be mapped into the
// tagger: every word in shortcuts is stored in the tag cache with its tag,
// overwriting any existing entry for that word.
func (p *Tagger) LoadShortcuts(shortcuts map[string]lingo.POSTag) {
	for word := range shortcuts {
		p.cachedTags[word] = shortcuts[word]
	}
}

// fillCache populates the tag cache from the training sentences. It counts how
// often each word form carries each tag, then caches the most frequent tag for
// every word that occurs at least freqThresh times and carries that tag in at
// least ambiguityThresh of its occurrences.
func (p *Tagger) fillCache(sentences []treebank.SentenceTag) {
	logf("Filling Cache with %d sentences", len(sentences))

	const (
		freqThresh      = 30
		ambiguityThresh = 0.98
	)

	// word -> tag -> number of occurrences
	counts := make(map[string]map[lingo.POSTag]int)
	for _, st := range sentences {
		for i, lex := range st.Sentence {
			perTag, seen := counts[lex.Value]
			if !seen {
				perTag = make(map[lingo.POSTag]int)
				counts[lex.Value] = perTag
			}
			perTag[st.Tags[i]]++
		}
	}

	for word, perTag := range counts {
		var (
			bestTag     lingo.POSTag
			best, total int
		)
		for tag, count := range perTag {
			if count > best {
				bestTag, best = tag, count
			}
			total += count
		}

		if total < freqThresh {
			continue
		}
		if float64(best)/float64(total) < ambiguityThresh {
			continue
		}
		p.cachedTags[word] = bestTag
	}
}

// shortcut looks for a tag that can be assigned without running the
// perceptron. lingo.POSTagShortcut is consulted first; if it has no answer the
// Tagger's tag cache is looked up by the lexeme's Value. The boolean reports
// whether a tag was found.
func (p *Tagger) shortcut(l lingo.Lexeme) (lingo.POSTag, bool) {
	if tag, ok := lingo.POSTagShortcut(l); ok {
		return tag, true
	}
	tag, ok := p.cachedTags[l.Value]
	return tag, ok
}

// setTag stores tag on the annotation and then fills in its Lemma and Stem.
// The shared null, root and start annotations are left untouched. The Lemma is
// set to the first lemma returned, and only when lemmatization succeeds with
// at least one result; the Stem is set only when stemming succeeds.
func (p *Tagger) setTag(a *lingo.Annotation, tag lingo.POSTag) {
	switch a {
	case lingo.NullAnnotation(), lingo.RootAnnotation(), lingo.StartAnnotation():
		return
	}

	a.POSTag = tag

	lemmas, err := p.Lemmatize(a.Value, tag)
	if err == nil && len(lemmas) > 0 {
		a.Lemma = lemmas[0]
	}

	stem, err := p.Stem(a.Value)
	if err == nil {
		a.Stem = stem
	}
}

// Progress is just a tuple of training progress info. Train sends one value
// per iteration: Iter is the iteration index, Count the number of annotations
// processed, Correct how many guesses matched the truth tag, and ShortCutted
// how many annotations were tagged by the shortcut lookup rather than the
// perceptron.
type Progress struct {
	Iter, Correct, Count, ShortCutted int
}


================================================
FILE: pos/release.go
================================================
// +build !debug

package pos

// BUILD_DEBUG identifies this as the release (non-debug) build of the package.
const BUILD_DEBUG = "POS TAGGER: Release Build"

// TABCOUNT and tracking are not used by anything in this file.
// NOTE(review): presumably they exist so the names resolve in both build
// flavours — confirm against the debug build.
var TABCOUNT uint32 = 0
var tracking = false

// The logging and tracing helpers below are no-ops in this build; tabcount
// always reports 0.
func tabcount() int                                   { return 0 }
func enterLoggingContext()                            {}
func leaveLoggingContext()                            {}
func logf(format string, others ...interface{})       {}
func recoverFrom(format string, attrs ...interface{}) {}

// ShowWeights and printShortcuts do nothing in this build.
func (p *Tagger) ShowWeights() {}
func printShortcuts(p *Tagger) {}


================================================
FILE: pos/sentence.go
================================================
package pos

import "github.com/chewxy/lingo"

// "log"

// getSentences groups the lexemes read from p.Input into sentences and sends
// each one on p.sentences, which it closes once p.Input is drained. A sentence
// always starts with the root annotation and is emitted when an EOF lexeme is
// seen. A failure to process an annotation panics (for now).
func (p *Tagger) getSentences() {
	defer close(p.sentences)

	sentence := lingo.AnnotatedSentence{lingo.RootAnnotation()}

	for lexeme := range p.Input {
		if lexeme.LexemeType == lingo.EOF {
			p.sentences <- sentence

			// start the next sentence
			sentence = lingo.AnnotatedSentence{lingo.RootAnnotation()}
			continue
		}

		annotation := lingo.NewAnnotation()
		annotation.Lexeme = lexeme
		if err := annotation.Process(p); err != nil {
			panic(err) // for now
		}
		sentence = append(sentence, annotation)

		// TODO: Sentence splitting
	}
}


================================================
FILE: pos/test_test.go
================================================
package pos

import (
	"github.com/chewxy/lingo"
	"github.com/kljensen/snowball"
)

// dummyLem is a stand-in lemmatizer for tests.
type dummyLem struct{}

// Lemmatize returns a single lemma: the first two bytes of s when s is longer
// than three bytes, and the empty string otherwise. The tag is ignored and the
// error is always nil.
func (dummyLem) Lemmatize(s string, pt lingo.POSTag) ([]string, error) {
	lemma := ""
	if len(s) > 3 {
		lemma = s[:2]
	}
	return []string{lemma}, nil
}

// dummyStemmer is a test stemmer backed by the snowball package.
type dummyStemmer struct{}

// Stem returns the result of snowball.Stem for s, called with the "english"
// language and true as its third argument.
func (dummyStemmer) Stem(s string) (string, error) {
	return snowball.Stem(s, "english", true)
}

// clusters is the fixed cluster map returned by dummyFix.Clusters: three
// casings of "test", all mapped to cluster 1.
var clusters = map[string]lingo.Cluster{
	"TEst": 1,
	"Test": 1,
	"test": 1,
}

// dummyFix bundles the dummy stemmer and lemmatizer and adds a Clusters method.
// NOTE(review): presumably this is the fixer handed to code under test —
// confirm against the test files that use it.
type dummyFix struct {
	dummyStemmer
	dummyLem
}

// Clusters returns the package-level test cluster map and a nil error.
func (dummyFix) Clusters() (map[string]lingo.Cluster, error) { return clusters, nil }

// conllu is a small test fixture of four sentences in tab-separated,
// ten-column rows, with a blank line between sentences.
// NOTE(review): the rows look like CoNLL-U (ID, form, lemma, universal tag,
// language-specific tag, features, head, relation, ...) — confirm.
const conllu = `1	From	from	ADP	IN	_	3	case	_	_
2	the	the	DET	DT	Definite=Def|PronType=Art	3	det	_	_
3	AP	AP	PROPN	NNP	Number=Sing	4	nmod	_	_
4	comes	come	VERB	VBZ	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	0	root	_	_
5	this	this	DET	DT	Number=Sing|PronType=Dem	6	det	_	_
6	story	story	NOUN	NN	Number=Sing	4	nsubj	_	_
7	:	:	PUNCT	:	_	4	punct	_	_

1	President	President	PROPN	NNP	Number=Sing	2	compound	_	_
2	Bush	Bush	PROPN	NNP	Number=Sing	5	nsubj	_	_
3	on	on	ADP	IN	_	4	case	_	_
4	Tuesday	Tuesday	PROPN	NNP	Number=Sing	5	nmod	_	_
5	nominated	nominate	VERB	VBD	Mood=Ind|Tense=Past|VerbForm=Fin	0	root	_	_
6	two	two	NUM	CD	NumType=Card	7	nummod	_	_
7	individuals	individual	NOUN	NNS	Number=Plur	5	dobj	_	_
8	to	to	PART	TO	_	9	mark	_	_
9	replace	replace	VERB	VB	VerbForm=Inf	5	advcl	_	_
10	retiring	retire	VERB	VBG	VerbForm=Ger	11	amod	_	_
11	jurists	jurist	NOUN	NNS	Number=Plur	9	dobj	_	_
12	on	on	ADP	IN	_	14	case	_	_
13	federal	federal	ADJ	JJ	Degree=Pos	14	amod	_	_
14	courts	court	NOUN	NNS	Number=Plur	11	nmod	_	_
15	in	in	ADP	IN	_	18	case	_	_
16	the	the	DET	DT	Definite=Def|PronType=Art	18	det	_	_
17	Washington	Washington	PROPN	NNP	Number=Sing	18	compound	_	_
18	area	area	NOUN	NN	Number=Sing	14	nmod	_	_
19	.	.	PUNCT	.	_	5	punct	_	_

1	Bush	Bush	PROPN	NNP	Number=Sing	2	nsubj	_	_
2	nominated	nominate	VERB	VBD	Mood=Ind|Tense=Past|VerbForm=Fin	0	root	_	_
3	Jennifer	Jennifer	PROPN	NNP	Number=Sing	5	compound	_	_
4	M.	M.	PROPN	NNP	Number=Sing	5	compound	_	_
5	Anderson	Anderson	PROPN	NNP	Number=Sing	2	dobj	_	_
6	for	for	ADP	IN	_	11	case	_	_
7	a	a	DET	DT	Definite=Ind|PronType=Art	11	det	_	_
8	15	15	NUM	CD	NumType=Card	10	nummod	_	_
9	-	-	PUNCT	HYPH	_	10	punct	_	_
10	year	year	NOUN	NN	Number=Sing	11	compound	_	_
11	term	term	NOUN	NN	Number=Sing	2	nmod	_	_
12	as	as	ADP	IN	_	14	case	_	_
13	associate	associate	ADJ	JJ	Degree=Pos	14	amod	_	_
14	judge	judge	NOUN	NN	Number=Sing	11	nmod	_	_
15	of	of	ADP	IN	_	18	case	_	_
16	the	the	DET	DT	Definite=Def|PronType=Art	18	det	_	_
17	Superior	Superior	PROPN	NNP	Number=Sing	18	compound	_	_
18	Court	Court	PROPN	NNP	Number=Sing	14	nmod	_	_
19	of	of	ADP	IN	_	21	case	_	_
20	the	the	DET	DT	Definite=Def|PronType=Art	21	det	_	_
21	District	District	PROPN	NNP	Number=Sing	18	nmod	_	_
22	of	of	ADP	IN	_	23	case	_	_
23	Columbia	Columbia	PROPN	NNP	Number=Sing	21	nmod	_	_
24	,	,	PUNCT	,	_	2	punct	_	_
25	replacing	replace	VERB	VBG	VerbForm=Ger	2	advcl	_	_
26	Steffen	Steffen	PROPN	NNP	Number=Sing	28	compound	_	_
27	W.	W.	PROPN	NNP	Number=Sing	28	compound	_	_
28	Graae	Graae	PROPN	NNP	Number=Sing	25	dobj	_	_
29	.	.	PUNCT	.	_	2	punct	_	_

1	We	we	PRON	PRP	Case=Nom|Number=Plur|Person=1|PronType=Prs	3	nsubj	_	_
2	've	have	AUX	VBP	Mood=Ind|Tense=Pres|VerbForm=Fin	3	aux	_	_
3	grown	grow	VERB	VBN	Tense=Past|VerbForm=Part	0	root	_	_
4	up	up	ADP	RP	_	3	compound:prt	_	_
5	.	.	PUNCT	.	_	3	punct	_	_`


================================================
FILE: pos/util.go
================================================
package pos

import (
	"math"

	"github.com/chewxy/lingo"
)

// maxScore returns the tag whose score is the largest. On ties the lowest tag
// wins because only a strictly greater score replaces the current best. If no
// score is greater than -math.MaxFloat64 the zero POSTag is returned.
func maxScore(scores *[lingo.MAXTAG]float64) lingo.POSTag {
	var best lingo.POSTag
	bestScore := -math.MaxFloat64

	for c := 0; c < len(scores); c++ {
		if score := scores[c]; score > bestScore {
			best, bestScore = lingo.POSTag(c), score
		}
	}

	return best
}


================================================
FILE: pos/util_test.go
================================================
package pos

import (
	"math"
	"math/rand"
	"testing"

	"github.com/chewxy/lingo"
)

// TestMaxScore fills a score array with seeded random values, forces the
// ROOT_TAG slot to math.MaxFloat64, and checks that maxScore picks ROOT_TAG.
func TestMaxScore(t *testing.T) {
	rand.Seed(1337)
	scores := new([lingo.MAXTAG]float64)

	for i := range scores {
		scores[i] = rand.Float64()
		if lingo.POSTag(i) == lingo.ROOT_TAG {
			scores[i] = math.MaxFloat64
		}
	}

	tag := maxScore(scores)
	if tag != lingo.ROOT_TAG {
		// NOTE(review): the message names "Score #10" while the check is
		// against lingo.ROOT_TAG — confirm the two still agree.
		t.Errorf("Expected Score #10 to be the max. Got %d instead", tag)
	}
}


================================================
FILE: sentence.go
================================================
package lingo

import (
	"bytes"
	"fmt"
	"sort"
	"strings"

	"github.com/pkg/errors"
)

/* Lexeme Sentence */
// LexemeSentence is a sentence represented as a slice of Lexemes.
type LexemeSentence []Lexeme

// NewLexemeSentence returns an empty, non-nil LexemeSentence.
func NewLexemeSentence() LexemeSentence { return LexemeSentence(make([]Lexeme, 0)) }

// String joins the Value of every lexeme with single spaces. Spaces at the
// very start and end of the result are trimmed.
func (ls LexemeSentence) String() string {
	var joined bytes.Buffer
	for i := range ls {
		joined.WriteString(ls[i].Value)
		joined.WriteString(" ")
	}
	return strings.Trim(joined.String(), " ")
}

/* Annotated Sentence */

// AnnotatedSentence is a sentence, but each word has been annotated. It is a
// slice of *Annotation, so copies of the slice share the same annotations.
type AnnotatedSentence []*Annotation

// NewAnnotatedSentence returns an empty, non-nil AnnotatedSentence.
func NewAnnotatedSentence() AnnotatedSentence { return make(AnnotatedSentence, 0) }

// Clone returns a copy of the sentence in which every annotation has been
// cloned, except for a rootAnnotation in first position, which stays shared.
func (as AnnotatedSentence) Clone() AnnotatedSentence {
	clone := make(AnnotatedSentence, len(as))

	for i := range as {
		if i == 0 && as[i] == rootAnnotation {
			// don't clone rootAnnotation
			clone[i] = rootAnnotation
		} else {
			clone[i] = as[i].Clone()
		}
	}
	return clone
}

// SetID sets the ID of every annotation to its index in the sentence. A
// rootAnnotation in first position is left untouched.
func (as AnnotatedSentence) SetID() {
	for i := range as {
		if i != 0 || as[i] != rootAnnotation {
			as[i].ID = i
		}
	}
}

// Fix repairs the sentence in place:
//
//   - if the first annotation carries the root lexeme, it is replaced by the
//     shared rootAnnotation;
//   - the IDs are renumbered with SetID;
//   - every non-nil Head is re-pointed: to rootAnnotation when its HeadID is -1
//     and the head carries the root lexeme, otherwise to the annotation of
//     this sentence found at index HeadID().
//
// An empty sentence is left as it is.
func (as AnnotatedSentence) Fix() {
	// Nothing to repair; indexing as[0] below would panic.
	if len(as) == 0 {
		return
	}

	if as[0].Lexeme == rootLexeme {
		as[0] = rootAnnotation
	}

	as.SetID()

	for _, a := range as {
		if a.Head == nil {
			continue
		}
		if a.HeadID() == -1 && a.Head.Lexeme == rootLexeme {
			a.Head = rootAnnotation
			continue
		}
		a.SetHead(as[a.HeadID()])
	}
}

// IsValid reports whether the IDs look properly set, i.e. at most one
// annotation has an ID of 0.
func (as AnnotatedSentence) IsValid() bool {
	// count the annotations whose ID is still zero
	unset := 0
	for i := range as {
		if as[i].ID == 0 {
			unset++
		}
	}

	// TODO
	// check that there is only one root

	// more than one zero ID means the IDs were not properly set
	return unset <= 1
}

/* Return slices of x */

// Phrase returns the slice of the sentence. While you can do the same by simply doing as[start:end], this method returns errors instead of panicking.
//
// An error is returned when start is negative, when end is larger than
// len(as), or when start is larger than end (which also covers a negative
// end). The returned phrase shares its annotations with the sentence.
func (as AnnotatedSentence) Phrase(start, end int) (AnnotatedSentence, error) {
	if start < 0 {
		return nil, errors.Errorf("Start: %d < 0", start)
	}
	if end > len(as) {
		return nil, errors.Errorf("End: %d > len(as): %d", end, len(as))
	}
	// as[start:end] panics when start > end, so reject it explicitly.
	if start > end {
		return nil, errors.Errorf("Start: %d > End: %d", start, end)
	}
	return as[start:end], nil
}

// IDs returns the ID of every annotation, in sentence order. The return value
// has exactly the same length as the sentence.
func (as AnnotatedSentence) IDs() []int {
	ids := make([]int, 0, len(as))
	for _, a := range as {
		ids = append(ids, a.ID)
	}
	return ids
}

// Tags returns the POSTag of every annotation, in sentence order. The return
// value has exactly the same length as the sentence.
func (as AnnotatedSentence) Tags() []POSTag {
	tags := make([]POSTag, 0, len(as))
	for _, a := range as {
		tags = append(tags, a.POSTag)
	}
	return tags
}

// Heads returns the HeadID of every annotation, in sentence order. The return
// value has exactly the same length as the sentence.
func (as AnnotatedSentence) Heads() []int {
	heads := make([]int, 0, len(as))
	for _, a := range as {
		heads = append(heads, a.HeadID())
	}
	return heads
}

// Leaves returns the indices of the annotations that are leaves, i.e. those
// for which Children reports nothing. If the dependency hasn't been set yet,
// every single *Annotation is a leaf. The result is nil when there is no leaf.
func (as AnnotatedSentence) Leaves() (retVal []int) {
	for i := 0; i < len(as); i++ {
		if children := as.Children(i); len(children) > 0 {
			continue
		}
		retVal = append(retVal, i)
	}
	return retVal
}

// Labels returns the DependencyType of every annotation, in sentence order.
// The return value has exactly the same length as the sentence.
func (as AnnotatedSentence) Labels() []DependencyType {
	labels := make([]DependencyType, 0, len(as))
	for _, a := range as {
		labels = append(labels, a.DependencyType)
	}
	return labels
}

// StringSlice returns the original words (the Value of every annotation) as a
// slice of string. The return value has exactly the same length as the
// sentence.
func (as AnnotatedSentence) StringSlice() []string {
	words := make([]string, 0, len(as))
	for _, a := range as {
		words = append(words, a.Value)
	}
	return words
}

// LoweredStringSlice returns the Lowered form of every word in the sentence as
// a slice of string. The return value has exactly the same length as the
// sentence.
func (as AnnotatedSentence) LoweredStringSlice() []string {
	lowered := make([]string, 0, len(as))
	for _, a := range as {
		lowered = append(lowered, a.Lowered)
	}
	return lowered
}

// Lemmas returns the lemmas as a slice of string. The return value has exactly
// the same length as the sentence.
func (as AnnotatedSentence) Lemmas() []string {
	out := make([]string, 0, len(as))
	for _, a := range as {
		out = append(out, a.Lemma)
	}
	return out
}

// Stems returns the stems as a slice of string. The return value has exactly
// the same length as the sentence.
func (as AnnotatedSentence) Stems() []string {
	out := make([]string, 0, len(as))
	for _, a := range as {
		out = append(out, a.Stem)
	}
	return out
}

// Children returns the indices of all annotations whose HeadID equals h, in
// sentence order. The result is nil when there are none.
func (as AnnotatedSentence) Children(h int) (retVal []int) {
	for i := range as {
		if as[i].HeadID() != h {
			continue
		}
		retVal = append(retVal, i)
	}
	return retVal
}

// Edges returns one DependencyEdge per annotation, sorted with edgeByID. The
// governor of an edge is the annotation at index HeadID(); an annotation with
// no Head, or whose HeadID is -1, is attached to the annotation at index 0.
func (as AnnotatedSentence) Edges() (retVal []DependencyEdge) {
	for _, dependent := range as {
		governor := 0
		if dependent.Head != nil {
			if id := dependent.HeadID(); id != -1 {
				governor = id
			}
		}

		retVal = append(retVal, DependencyEdge{as[governor], dependent, dependent.DependencyType})
	}
	sort.Sort(edgeByID(retVal))
	return retVal
}

/* To other structures */

// Dependency builds a *Dependency from the sentence by passing the
// FromAnnotatedSentence(as) option to NewDependency.
func (as AnnotatedSentence) Dependency() *Dependency {
	return NewDependency(FromAnnotatedSentence(as))
}

// Tree converts the sentence into a *DependencyTree and returns its root node.
//
// The root node always wraps rootAnnotation and stands in for index 0; the
// annotation stored at as[0] does not get a node of its own. Every other
// annotation becomes a node attached to the node of its head, where a HeadID
// of -1 means the root. Nodes for heads that appear later in the sentence are
// created on demand and completed when their own index is reached.
//
// An empty sentence yields a root node without children.
func (as AnnotatedSentence) Tree() *DependencyTree {
	rootNode := NewDependencyTree(nil, 0, rootAnnotation)

	// With no annotations there is no tracker slot for the root; writing
	// tracker[0] below would panic.
	if len(as) == 0 {
		return rootNode
	}

	// tracker[i] is the node built for as[i], or nil if not built yet.
	tracker := make([]*DependencyTree, len(as))
	tracker[0] = rootNode

	for i := 1; i < len(as); i++ {
		head := as[i].HeadID()
		var headDep *DependencyTree

		if head == -1 {
			headDep = rootNode
		} else {
			headDep = tracker[head]
		}

		if headDep == nil {
			// make a dependency for the head
			headDep = NewDependencyTree(nil, head, as[head])
			tracker[head] = headDep
		}

		dep := tracker[i]

		if dep == nil {
			dep = NewDependencyTree(headDep, i, as[i])
			tracker[i] = dep
		} else {
			// the node was created earlier as somebody's head; it only lacks
			// its parent
			dep.Parent = headDep
		}

		headDep.AddChild(dep)
		dep.Type = as[i].DependencyType

	}
	return rootNode
}

// Stringer interface

// String renders the sentence as space-separated "value/tag" pairs.
func (as AnnotatedSentence) String() string {
	var buf bytes.Buffer
	for i, a := range as {
		if i > 0 {
			buf.WriteString(" ")
		}
		fmt.Fprintf(&buf, "%s/%s", a.Value, a.POSTag)
	}
	return buf.String()
}

// ValueString returns the original words of the sentence joined by single
// spaces.
func (as AnnotatedSentence) ValueString() string {
	return strings.Join(as.StringSlice(), " ")
}

// LoweredString returns the Lowered form of every word in the sentence, joined
// by single spaces.
func (as AnnotatedSentence) LoweredString() string {
	return strings.Join(as.LoweredStringSlice(), " ")
}

// LemmaString returns the lemmas of the sentence joined by single spaces.
func (as AnnotatedSentence) LemmaString() string {
	return strings.Join(as.Lemmas(), " ")
}

// StemString returns the stems of the sentence joined by single spaces.
func (as AnnotatedSentence) StemString() string {
	return strings.Join(as.Stems(), " ")
}

// sort interface: Len, Swap and Less make AnnotatedSentence usable with
// sort.Sort, which orders the annotations by ascending ID.
func (as AnnotatedSentence) Len() int           { return len(as) }
func (as AnnotatedSentence) Swap(i, j int)      { as[i], as[j] = as[j], as[i] }
func (as AnnotatedSentence) Less(i, j int) bool { return as[i].ID < as[j].ID }


================================================
FILE: sets.go
================================================
package lingo

import (
	"bytes"
	"fmt"
)

/* TAG SET */

// TagSet is a set of all the POSTags: the entry at index t records whether
// POSTag(t) is in the set.
type TagSet [MAXTAG]bool

// String lists every tag with its membership flag, one "tag: bool" per line.
func (ts TagSet) String() string {
	var buf bytes.Buffer
	for t := range ts {
		fmt.Fprintf(&buf, "%v: %v\n", POSTag(t), ts[t])
	}
	return buf.String()
}

// DependencyTypeSet is a set of all the DependencyTypes: the entry at index t
// records whether DependencyType(t) is in the set.
type DependencyTypeSet [MAXDEPTYPE]bool

// String lists every dependency type with its membership flag, one
// "type: bool" per line.
func (dts DependencyTypeSet) String() string {
	var buf bytes.Buffer
	for t := range dts {
		fmt.Fprintf(&buf, "%v: %v\n", DependencyType(t), dts[t])
	}
	return buf.String()
}


================================================
FILE: shape.go
================================================
package lingo

import (
	"bytes"
	"unicode"
)

// Shape represents the shape of a word. It's currently implemented as a string
type Shape string

// Shape computes the shape of the lexeme's Value.
//
// Values longer than 50 bytes have the shape "Long". Otherwise every rune is
// mapped to a shape character: 'X' for upper-case letters, 'x' for all other
// letters, 'd' for digits, and the rune itself for anything else. Runs of the
// same shape character are capped: only the first four of a run are written.
// If a rune that is neither a letter nor a digit is met in a lexeme of type
// URI, the shape is "URI".
func (l Lexeme) Shape() Shape {
	word := l.Value
	if len(word) > 50 {
		return Shape("Long")
	}

	var out bytes.Buffer

	prev := ' ' // shape character of the previous rune
	run := 0    // how many times prev has repeated so far
	for _, r := range word {
		var cur rune
		switch {
		case unicode.IsLetter(r) && unicode.IsUpper(r):
			cur = 'X'
		case unicode.IsLetter(r):
			cur = 'x'
		case unicode.IsDigit(r):
			cur = 'd'
		case l.LexemeType == URI:
			return Shape("URI")
		default:
			cur = r
		}

		if cur == prev {
			run++
		} else {
			run = 0 // a new run starts
			prev = cur
		}

		if run < 4 {
			out.WriteRune(cur)
		}
	}

	return Shape(out.String())
}


================================================
FILE: stopwords.go
================================================
package lingo

import "strings"

const sw = `a about above across after afterwards again against all almost alone along already also although always am among amongst amoungst amount an and another any anyhow anyone anything anyway anywhere are around as at back be became because become becomes becoming been before beforehand behind being below beside besides between beyond bill both bottom but by call can cannot cant co computer con could couldnt cry de describe detail did didn do does doesn doing don done down due during each eg eight either eleven else elsewhere empty enough etc even ever every everyone everything everywhere except few fifteen fify fill find fire first five for former formerly forty found four from front full further get give go had has hasnt have he hence her here hereafter hereby herein hereupon hers herself him himself his how however hundred i ie if in inc indeed interest into is it its itself just keep kg km last latter latterly least less ltd made make many may me meanwhile might mill mine more moreover most mostly move much must my myself name namely neither never nevertheless next nine no nobody none noone nor not nothing now nowhere of off often on once one only onto or other others otherwise our ours ourselves out over own part per perhaps please put quite rather re really regarding same say see seem seemed seeming seems serious several she should show side since sincere six sixty so some somehow someone something sometime sometimes somewhere still such system take ten than that the their them themselves then thence there thereafter thereby therefore therein thereupon these they thick thin third this those though three through throughout thru thus to together too top toward towards twelve twenty two un under unless until up upon us used using various very via was we well were what whatever when whence whenever where whereafter whereas whereby wherein whereupon wherever whether which while whither who whoever whole whom whose why will with within without would yet you 
your yours yourself yourselves`

// stopwords is the set of stop words, keyed by word. It is filled once, by
// init, from the whitespace-separated list in sw.
var stopwords = make(map[string]struct{})

func init() {
	// strings.Fields splits on any run of whitespace. The sw literal contains
	// a line break between "you" and "your"; splitting on a single space would
	// register "\nyour" and leave "your" out of the set.
	for _, s := range strings.Fields(sw) {
		stopwords[s] = empty
	}
}

var specials = `-ROOT- -UNKNOWN-`

// UnescapeSpecials converts the escaped treebank tokens it knows about back to
// their literal form: "-LRB-" becomes "(", "-RRB-" becomes ")", "``" becomes a
// double quote and "-NULL-" becomes the empty string. Any other word is
// returned unchanged.
func UnescapeSpecials(word string) string {
	if word == "-LRB-" {
		return "("
	}
	if word == "-RRB-" {
		return ")"
	}
	if word == "``" {
		return "\""
	}
	if word == "-NULL-" {
		return ""
	}
	return word
}


================================================
FILE: treebank/const_postag_stanford.go
================================================
// +build stanfordtags

package treebank

import "github.com/chewxy/lingo"

// posTagTable maps the textual form of a part-of-speech tag to its
// lingo.POSTag value; this variant is compiled with the stanfordtags build
// tag. Several strings share a value: "PPRP" and "PRP$" both map to
// lingo.PPRP, "PWP" and "WP$" both map to lingo.PWP, and "X" and "-NULL-" both
// map to lingo.X.
var posTagTable map[string]lingo.POSTag = map[string]lingo.POSTag{
	"X": lingo.X,

	"CC":   lingo.CC,
	"CD":   lingo.CD,
	"DT":   lingo.DT,
	"EX":   lingo.EX,
	"FW":   lingo.FW,
	"IN":   lingo.IN,
	"JJ":   lingo.JJ,
	"JJR":  lingo.JJR,
	"JJS":  lingo.JJS,
	"LS":   lingo.LS,
	"MD":   lingo.MD,
	"NN":   lingo.NN,
	"NNS":  lingo.NNS,
	"NNP":  lingo.NNP,
	"NNPS": lingo.NNPS,
	"PDT":  lingo.PDT,
	"POS":  lingo.POS,
	"PRP":  lingo.PRP,
	"PPRP": lingo.PPRP,
	"PRP$": lingo.PPRP,
	"RB":   lingo.RB,
	"RBR":  lingo.RBR,
	"RBS":  lingo.RBS,
	"RP":   lingo.RP,
	"SYM":  lingo.SYM,
	"TO":   lingo.TO,
	"UH":   lingo.UH,
	"VB":   lingo.VB,
	"VBD":  lingo.VBD,
	"VBG":  lingo.VBG,
	"VBN":  lingo.VBN,
	"VBP":  lingo.VBP,
	"VBZ":  lingo.VBZ,
	"WDT":  lingo.WDT,
	"WP":   lingo.WP,
	"PWP":  lingo.PWP,
	"WP$":  lingo.PWP,
	"WRB":  lingo.WRB,

	// punctuation
	",":     lingo.COMMA,
	"``":    lingo.OPENQUOTE,
	"''":    lingo.CLOSEQUOTE,
	".":     lingo.FULLSTOP,
	":":     lingo.COLON,
	"$":     lingo.DOLLAR,
	"#":     lingo.HASHSIGN,
	"-LRB-": lingo.LEFTBRACE,
	"-RRB-": lingo.RIGHTBRACE,

	// NOTE(review): these look like the additional tags found in newer
	// treebanks — confirm the source of the list.
	"ADD":  lingo.ADD,
	"NFP":  lingo.NFP,
	"HYPH": lingo.HYPH,
	"GW":   lingo.GW,
	"AFX":  lingo.AFX,
	"XX":   lingo.XX,

	// special placeholder tokens
	"-NULL-":    lingo.X,
	"-ROOT-":    lingo.ROOT_TAG,
	"-UNKNOWN-": lingo.UNKNOWN_TAG,
}


================================================
FILE: treebank/const_postag_universal.go
================================================
// +build !stanfordtags

package treebank

import "github.com/chewxy/lingo"

// posTagTable maps the textual form of a part-of-speech tag to its
// lingo.POSTag value; this variant is compiled when the stanfordtags build tag
// is not set. "X" and "-NULL-" both map to lingo.X.
var posTagTable map[string]lingo.POSTag = map[string]lingo.POSTag{
	"X":     lingo.X,
	"ADJ":   lingo.ADJ,
	"ADP":   lingo.ADP,
	"ADV":   lingo.ADV,
	"AUX":   lingo.AUX,
	"CONJ":  lingo.CONJ,
	"DET":   lingo.DET,
	"INTJ":  lingo.INTJ,
	"NOUN":  lingo.NOUN,
	"NUM":   lingo.NUM,
	"PART":  lingo.PART,
	"PRON":  lingo.PRON,
	"PROPN": lingo.PROPN,
	"PUNCT": lingo.PUNCT,
	"SCONJ": lingo.SCONJ,
	"SYM":   lingo.SYM,
	"VERB":  lingo.VERB,

	// special placeholder tokens
	"-NULL-":    lingo.X,
	"-ROOT-":    lingo.ROOT_TAG,
	"-UNKNOWN-": lingo.UNKNOWN_TAG,
}


================================================
FILE: treebank/const_rel_stanford.go
================================================
// +build stanfordrel

package treebank

import "github.com/chewxy/lingo"

// dependencyTable maps the dependency relation labels found in a treebank to
// lingo.DependencyType values. This variant is compiled only when the
// stanfordrel build tag is set.
//
// The sentinel "-NULL-" maps to the generic lingo.Dep relation.
//
// NOTE(review): the "mdvod" key looks like it could be a misspelling; confirm
// against the treebank data before relying on it.
var dependencyTable map[string]lingo.DependencyType = map[string]lingo.DependencyType{
	"root":       lingo.Root,
	"dep":        lingo.Dep,
	"aux":        lingo.Aux,
	"auxpass":    lingo.AuxPass,
	"cop":        lingo.Cop,
	"arg":        lingo.Arg,
	"agent":      lingo.Agent,
	"comp":       lingo.Comp,
	"acomp":      lingo.AComp,
	"ccomp":      lingo.CComp,
	"xcomp":      lingo.XComp,
	"obj":        lingo.Obj,
	"dobj":       lingo.DObj,
	"iobj":       lingo.IObj,
	"pobj":       lingo.PObj,
	"subj":       lingo.Subj,
	"nsubj":      lingo.NSubj,
	"nsubjpass":  lingo.NSubjPass,
	"csubj":      lingo.CSubj,
	"csubjpass":  lingo.CSubjPass,
	"cc":         lingo.Coordination,
	"conj":       lingo.Conj,
	"expl":       lingo.Expl,
	"mod":        lingo.Mod,
	"amod":       lingo.AMod,
	"appos":      lingo.Appos,
	"advcl":      lingo.Advcl,
	"det":        lingo.Det,
	"predet":     lingo.Predet,
	"preconj":    lingo.Preconj,
	"vmod":       lingo.Vmod,
	"mwe":        lingo.MWE,
	"mark":       lingo.Mark,
	"advmod":     lingo.AdvMod,
	"neg":        lingo.Neg,
	"rcmod":      lingo.RCMod,
	"quantmod":   lingo.QuantMod,
	"nn":         lingo.NounMod,
	"npadvmod":   lingo.NPAdvMod,
	"tmod":       lingo.TMod,
	"num":        lingo.Num,
	"number":     lingo.NumberElement,
	"prep":       lingo.Prep,
	"poss":       lingo.Poss,
	"possessive": lingo.Possessive,
	"prt":        lingo.PRT,
	"parataxis":  lingo.Parataxis,
	"goeswith":   lingo.GoesWith,
	"punct":      lingo.Punct,
	"ref":        lingo.Ref,
	"sdep":       lingo.SDep,
	"xsubj":      lingo.XSubj,

	// additional stuff not found in the original, but found in EWT
	"case":       lingo.Case,
	"compound":   lingo.Compound,
	"nmod":       lingo.NMod,
	"discourse":  lingo.Discourse,
	"nummod":     lingo.NumMod,
	"relcl":      lingo.RelCl,
	"nfincl":     lingo.NFinCl,
	"nmod:poss":  lingo.NMod_Poss,
	"nmod:npmod": lingo.NMod_NPMod,
	"vocative":   lingo.Vocative,
	"list":       lingo.List,
	"mwprep":     lingo.MWPrep,
	"remnant":    lingo.Remnant,
	"acl":        lingo.Acl,
	"npmod":      lingo.NPMod,
	"mdvod":      lingo.MDVod,
	"detmod":     lingo.DetMod,

	// found in NNParser
	"pcomp": lingo.PComp,

	"-NULL-": lingo.Dep,
}


================================================
FILE: treebank/const_rel_universal.go
================================================
// +build !stanfordrel

package treebank

import "github.com/chewxy/lingo"

// dependencyTable maps the dependency relation labels found in a treebank to
// lingo.DependencyType values. This variant is compiled when the stanfordrel
// build tag is NOT set.
//
// Both "cc:preconj" and "conj:preconj" resolve to lingo.CC_PreConj (see the
// issue linked on those entries). The sentinel "-NULL-" maps to
// lingo.NoDepType.
var dependencyTable map[string]lingo.DependencyType = map[string]lingo.DependencyType{
	"dep":          lingo.Dep,
	"root":         lingo.Root,
	"nsubj":        lingo.NSubj,
	"nsubjpass":    lingo.NSubjPass,
	"dobj":         lingo.DObj,
	"iobj":         lingo.IObj,
	"csubj":        lingo.CSubj,
	"csubjpass":    lingo.CSubjPass,
	"ccomp":        lingo.CComp,
	"xcomp":        lingo.XComp,
	"nummod":       lingo.NumMod,
	"appos":        lingo.Appos,
	"nmod":         lingo.NMod,
	"acl":          lingo.ACl,
	"acl:relcl":    lingo.ACl_RelCl,
	"det":          lingo.Det,
	"det:predet":   lingo.Det_PreDet,
	"amod":         lingo.AMod,
	"neg":          lingo.Neg,
	"case":         lingo.Case,
	"nmod:npmod":   lingo.NMod_NPMod,
	"nmod:tmod":    lingo.NMod_TMod,
	"nmod:poss":    lingo.NMod_Poss,
	"advcl":        lingo.AdvCl,
	"advmod":       lingo.AdvMod,
	"compound":     lingo.Compound,
	"compound:prt": lingo.Compound_Part,
	"name":         lingo.Name,
	"mwe":          lingo.MWE,
	"foreign":      lingo.Foreign,
	"goeswith":     lingo.GoesWith,
	"list":         lingo.List,
	"dislocated":   lingo.Dislocated,
	"parataxis":    lingo.Parataxis,
	"remnant":      lingo.Remnant,
	"reparandum":   lingo.Reparandum,
	"vocative":     lingo.Vocative,
	"discourse":    lingo.Discourse,
	"expl":         lingo.Expl,
	"aux":          lingo.Aux,
	"auxpass":      lingo.AuxPass,
	"cop":          lingo.Cop,
	"mark":         lingo.Mark,
	"punct":        lingo.Punct,
	"conj":         lingo.Conj,
	"cc":           lingo.Coordination,
	"cc:preconj":   lingo.CC_PreConj, // https://github.com/UniversalDependencies/docs/issues/221
	"conj:preconj": lingo.CC_PreConj, // https://github.com/UniversalDependencies/docs/issues/221

	"-NULL-": lingo.NoDepType,
}


================================================
FILE: treebank/sentenceTag.go
================================================
package treebank

import (
	"math/rand"

	"github.com/chewxy/lingo"
)

// SentenceTag is a struct that holds a sentence together with its POS tags,
// dependency heads and dependency labels.
//
// Tags, Heads and Labels run parallel to Sentence: entry i describes lexeme i.
// A head value is an index into the root-prefixed sentence built by
// AnnotatedSentence, so 0 refers to the root and k (k >= 1) to the k-th lexeme.
type SentenceTag struct {
	Sentence lingo.LexemeSentence
	Tags     []lingo.POSTag
	Heads    []int
	Labels   []lingo.DependencyType
}

// AnnotatedSentence converts the SentenceTag into a lingo.AnnotatedSentence.
//
// The result starts with the root annotation, followed by one annotation per
// lexeme that carries the matching POS tag and dependency label. Every
// annotation is processed with f, heads are then attached from s.Heads (whose
// values index into the root-prefixed result), and Fix is finally called on
// the sentence.
//
// It panics if processing an annotation fails.
func (s SentenceTag) AnnotatedSentence(f lingo.AnnotationFixer) lingo.AnnotatedSentence {
	annotated := append(lingo.NewAnnotatedSentence(), lingo.RootAnnotation())

	for idx := range s.Sentence {
		ann := lingo.NewAnnotation()
		ann.Lexeme = s.Sentence[idx]
		ann.POSTag = s.Tags[idx]
		ann.DependencyType = s.Labels[idx]

		// Panicking is deliberate: SentenceTag is only ever used during
		// training, where a processing failure must not go unnoticed.
		if err := ann.Process(f); err != nil {
			panic(err)
		}

		annotated = append(annotated, ann)
	}

	// Attach heads to everything but the root, which sits at index 0.
	for i, ann := range annotated[1:] {
		ann.SetHead(annotated[s.Heads[i]])
	}

	annotated.Fix()

	return annotated
}

// Dependency builds the annotated sentence for s (using f as the annotation
// fixer) and returns the *lingo.Dependency derived from it.
func (s SentenceTag) Dependency(f lingo.AnnotationFixer) *lingo.Dependency {
	annotated := s.AnnotatedSentence(f)
	return annotated.Dependency()
}

// String returns the string form of the underlying lexeme sentence; tags,
// heads and labels are not included.
func (s SentenceTag) String() string {
	sentence := s.Sentence
	return sentence.String()
}

// ShuffleSentenceTag shuffles s in place and returns it.
//
// The shuffle is driven by a private generator seeded with the constant 1337,
// so the same input length always produces the same permutation. A private
// generator is used instead of the package-level math/rand functions because
// reseeding the global source affects every other user of math/rand, and
// because the top-level rand.Seed has no effect on newer Go releases, which
// would silently make the shuffle non-reproducible.
func ShuffleSentenceTag(s []SentenceTag) []SentenceTag {
	rng := rand.New(rand.NewSource(1337))
	for i := range s {
		// Swap element i with a uniformly chosen element from s[0..i].
		j := rng.Intn(i + 1)
		s[i], s[j] = s[j], s[i]
	}

	return s
}

/* UTILITY FUNCTIONS */

// WrapLexemeSentence returns a new sentence made of lingo.StartLexeme(), the
// lexemes of sentence, and lingo.RootLexeme(), in that order.
func WrapLexemeSentence(sentence lingo.LexemeSentence) lingo.LexemeSentence {
	wrapped := append(lingo.NewLexemeSentence(), lingo.StartLexeme())
	wrapped = append(wrapped, sentence...)
	return append(wrapped, lingo.RootLexeme())
}

// WrapTags returns a new slice holding tagList with lingo.X added at both the
// front and the back.
func WrapTags(tagList []lingo.POSTag) []lingo.POSTag {
	wrapped := []lingo.POSTag{lingo.X}
	wrapped = append(wrapped, tagList...)
	return append(wrapped, lingo.X)
}

// WrapHeads returns a new slice holding heads with a 0 added at both the front
// and the back. The head values themselves are copied unchanged.
func WrapHeads(heads []int) []int {
	wrapped := []int{0}
	wrapped = append(wrapped, heads...)
	return append(wrapped, 0)
}

// WrapDeps returns a new slice holding deps with lingo.Dep added at both the
// front and the back.
func WrapDeps(deps []lingo.DependencyType) []lingo.DependencyType {
	wrapped := []lingo.DependencyType{lingo.Dep}
	wrapped = append(wrapped, deps...)
	return append(wrapped, lingo.Dep)
}


================================================
FILE: treebank/sentenceTag_test.go
================================================
package treebank

import (
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

// TestSentenceTag checks that the heads parsed from the sample sentence
// survive the conversion into a lingo.Dependency.
func TestSentenceTag(t *testing.T) {
	assert := assert.New(t)

	sentences := ReadConllu(strings.NewReader(sampleConllu))
	st := sentences[0]

	correctHeads := []int{2, 5, 4, 5, 0, 7, 5, 9, 5, 11, 9, 14, 14, 11, 18, 18, 18, 14, 5}
	assert.Equal(correctHeads, st.Heads)

	// The first entry of the dependency's heads is skipped; the remaining
	// ones must match the parsed heads.
	depHeads := st.Dependency(nil).Heads()
	assert.Equal(correctHeads, depHeads[1:])
}


================================================
FILE: treebank/treebank.go
================================================
package treebank

import (
	"archive/zip"
	"io"
	"log"

	"github.com/chewxy/lingo"

	"bufio"
	"os"
	"strconv"
	"strings"
)

// empty is the zero-size value stored in the set-like maps used by ReadConllu.
var empty struct{}

// Loader is the signature of a function that loads a treebank, identified by
// a string argument, into a slice of SentenceTag. LoadUniversal and LoadEWT
// both have this signature. It is kept for future uses that need to select
// between treebank loaders.
type Loader func(string) []SentenceTag

// LoadUniversal opens the CoNLL-U formatted treebank file named fileName and
// returns the sentences parsed from it by ReadConllu.
//
// It logs the file name and panics if the file cannot be opened.
func LoadUniversal(fileName string) []SentenceTag {
	file, openErr := os.Open(fileName)
	if openErr != nil {
		log.Printf("filename %q", fileName)
		panic(openErr)
	}
	defer file.Close()

	return ReadConllu(file)
}

// ReadConllu reads CoNLL-U formatted text from reader and returns one
// SentenceTag per sentence.
//
// Lines starting with "#" are comments and are skipped. An empty line ends the
// current sentence. Every other line must consist of at least 8 tab-separated
// columns, of which these are used: FORM (column 2), UPOS or XPOS (column 4 or
// 5, selected by lingo.BUILD_TAGSET), HEAD (column 7) and DEPREL (column 8).
// Multiword-token ranges (ID such as "3-4") and empty nodes (ID such as "3.1")
// have no head of their own and are skipped.
//
// Tags and relations missing from the lookup tables are stored as the zero
// value of their type. A final sentence that is not followed by an empty line
// is still returned.
//
// ReadConllu panics on a line with too few columns, on a non-numeric HEAD, on
// an unknown lingo.BUILD_TAGSET and when reading from reader fails.
func ReadConllu(reader io.Reader) []SentenceTag {
	s, st, sh, sdt := reset()
	sentences := make([]SentenceTag, 0)
	sentenceCount := 0

	// Bookkeeping of the tags/relations that were seen and of those that
	// could not be resolved. NOTE: these are only collected, never reported.
	var usedTags lingo.TagSet
	var usedDepTypes lingo.DependencyTypeSet
	var unknownTags = make(map[string]struct{})
	var unknownDepType = make(map[string]struct{})

	bs := bufio.NewScanner(reader)

	// colCount counts every scanned line (comments and blank lines included)
	// and is stored in each lexeme as its position.
	colCount := 0
	for ; bs.Scan(); colCount++ {
		l := bs.Text()
		if strings.HasPrefix(l, "#") {
			// comments
			continue
		}
		if len(l) == 0 {
			// then this is a new sentence
			sentences = finish(s, st, sh, sdt, sentences)
			s, st, sh, sdt = reset()

			sentenceCount++
			continue
		}

		cols := strings.Split(l, "\t")
		if len(cols) < 8 {
			// Fail with a message that points at the offending line rather
			// than with a bare index-out-of-range.
			panic("treebank: malformed CoNLL-U line " + strconv.Itoa(colCount+1) +
				": expected at least 8 tab-separated columns, got " + strconv.Itoa(len(cols)))
		}

		// Multiword-token ranges and empty nodes have "_" as HEAD, so they
		// cannot be turned into a lexeme with a head.
		if strings.ContainsAny(cols[0], "-.") {
			continue
		}

		word := cols[1]

		var tag string
		switch lingo.BUILD_TAGSET {
		case "stanfordtags":
			tag = cols[4]
		case "universaltags":
			tag = cols[3]
		default:
			panic("Unknown tagset")
		}

		head := cols[6]
		depType := cols[7]

		var t lingo.POSTag
		var dt lingo.DependencyType
		var h int
		var ok bool
		var err error

		word = lingo.UnescapeSpecials(word)

		lexType := StringToLexType(tag)
		if t, ok = StringToPOSTag(tag); ok {
			usedTags[t] = true
		} else {
			unknownTags[tag] = empty
		}

		if h, err = strconv.Atoi(head); err != nil {
			panic(err) // panic is the right option, because there is no default
		}

		if dt, ok = StringToDependencyType(depType); ok {
			usedDepTypes[dt] = true
		} else {
			unknownDepType[depType] = empty
		}

		lexeme := lingo.Lexeme{word, lexType, sentenceCount, colCount, 0} // TODO: add byte offset
		s = append(s, lexeme)
		st = append(st, t)
		sh = append(sh, h)
		sdt = append(sdt, dt)
	}

	// Scan returns false both at EOF and on failure; only Err tells the two
	// apart. Without this check a read error would yield a truncated treebank.
	if err := bs.Err(); err != nil {
		panic(err)
	}

	// Keep the last sentence when the input does not end with an empty line.
	if len(s) > 0 {
		sentences = finish(s, st, sh, sdt, sentences)
	}

	return sentences
}

// LoadEWT loads a zipped English Web Treebank (as donated by Google).
//
// Every entry of the zip archive named filename is parsed with ReadConllu and
// the resulting sentences are concatenated in archive order. It panics if the
// archive or one of its entries cannot be opened.
func LoadEWT(filename string) []SentenceTag {
	archive, err := zip.OpenReader(filename)
	if err != nil {
		panic(err)
	}
	defer archive.Close()

	sentences := make([]SentenceTag, 0)

	for _, entry := range archive.File {
		rc, openErr := entry.Open()
		if openErr != nil {
			panic(openErr)
		}
		parsed := ReadConllu(rc)
		sentences = append(sentences, parsed...)
		rc.Close()
	}

	return sentences
}


================================================
FILE: treebank/treebank_test.go
================================================
package treebank

import (
	"strings"
	"testing"

	"github.com/chewxy/lingo"
	"github.com/stretchr/testify/assert"
)

// sampleConllu is a single CoNLL-U sentence of 19 tokens, terminated by an
// empty line, that is shared by the tests of this package. Columns are
// separated by tab characters.
const sampleConllu = `1	President	President	PROPN	NNP	Number=Sing	2	compound	_	_
2	Bush	Bush	PROPN	NNP	Number=Sing	5	nsubj	_	_
3	on	on	ADP	IN	_	4	case	_	_
4	Tuesday	Tuesday	PROPN	NNP	Number=Sing	5	nmod	_	_
5	nominated	nominate	VERB	VBD	Mood=Ind|Tense=Past|VerbForm=Fin	0	root	_	_
6	two	two	NUM	CD	NumType=Card	7	nummod	_	_
7	individuals	individual	NOUN	NNS	Number=Plur	5	dobj	_	_
8	to	to	PART	TO	_	9	mark	_	_
9	replace	replace	VERB	VB	VerbForm=Inf	5	advcl	_	_
10	retiring	retire	VERB	VBG	VerbForm=Ger	11	amod	_	_
11	jurists	jurist	NOUN	NNS	Number=Plur	9	dobj	_	_
12	on	on	ADP	IN	_	14	case	_	_
13	federal	federal	ADJ	JJ	Degree=Pos	14	amod	_	_
14	courts	court	NOUN	NNS	Number=Plur	11	nmod	_	_
15	in	in	ADP	IN	_	18	case	_	_
16	the	the	DET	DT	Definite=Def|PronType=Art	18	det	_	_
17	Washington	Washington	PROPN	NNP	Number=Sing	18	compound	_	_
18	area	area	NOUN	NN	Number=Sing	14	nmod	_	_
19	.	.	PUNCT	.	_	5	punct	_	_

`

// Test_ReadConllu parses the sample sentence and checks the heads, the POS
// tags (for whichever tag set the build selected) and, for the universal
// relation set, the dependency labels.
func Test_ReadConllu(t *testing.T) {
	assert := assert.New(t)

	parsed := ReadConllu(strings.NewReader(sampleConllu))
	st := parsed[0]

	correctHeads := []int{2, 5, 4, 5, 0, 7, 5, 9, 5, 11, 9, 14, 14, 11, 18, 18, 18, 14, 5}
	assert.Equal(correctHeads, st.Heads)

	// we compare by string to avoid having to build two different test files
	correctPOS := []string{
		"PROPN", "PROPN", "ADP", "PROPN", "VERB",
		"NUM", "NOUN", "PART", "VERB", "VERB",
		"NOUN", "ADP", "ADJ", "NOUN", "ADP",
		"DET", "PROPN", "NOUN", "PUNCT",
	}
	if lingo.BUILD_TAGSET == "stanfordtags" {
		correctPOS = []string{
			"NNP", "NNP", "IN", "NNP", "VBD",
			"CD", "NNS", "TO", "VB", "VBG",
			"NNS", "IN", "JJ", "NNS", "IN",
			"DT", "NNP", "NN", "FULLSTOP",
		}
	}

	assert.Equal(correctPOS, ttos(st.Tags))

	// the stanford tags are not listed in the CONLLU format
	if lingo.BUILD_RELSET == "stanfordrel" {
		return
	}

	correctRel := []string{
		"Compound", "NSubj", "Case", "NMod", "Root",
		"NumMod", "DObj", "Mark", "AdvCl", "AMod",
		"DObj", "Case", "AMod", "NMod", "Case",
		"Det", "Compound", "NMod", "Punct",
	}

	assert.Equal(correctRel, ltos(st.Labels))
}

// ttos converts a slice of POS tags into the slice of their String() forms.
func ttos(ts []lingo.POSTag) []string {
	names := make([]string, 0, len(ts))
	for idx := range ts {
		names = append(names, ts[idx].String())
	}
	return names
}

// ltos converts a slice of dependency types into the slice of their String()
// forms.
func ltos(ls []lingo.DependencyType) []string {
	names := make([]string, 0, len(ls))
	for idx := range ls {
		names = append(names, ls[idx].String())
	}
	return names
}


================================================
FILE: treebank/util.go
================================================
package treebank

import "github.com/chewxy/lingo"

// alreadyLogged is a package-level set of strings.
// NOTE(review): nothing in the visible part of this package reads or writes
// it; confirm whether it is still needed.
var alreadyLogged map[string]bool = make(map[string]bool)

// StringToLexType derives a lexeme type from a tag string: "NUM" yields
// lingo.Number, "PUNCT" yields lingo.Punctuation, "SYM" yields lingo.Symbol
// and anything else yields lingo.Word.
//
// TODO : CHECK
func StringToLexType(tag string) lingo.LexemeType {
	switch tag {
	case "NUM":
		return lingo.Number
	case "PUNCT":
		return lingo.Punctuation
	case "SYM":
		return lingo.Symbol
	}
	return lingo.Word
}

// StringToPOSTag looks tag up in posTagTable. The boolean reports whether the
// tag is known; for an unknown tag the zero lingo.POSTag is returned.
func StringToPOSTag(tag string) (lingo.POSTag, bool) {
	if found, known := posTagTable[tag]; known {
		return found, true
	}

	var zero lingo.POSTag
	return zero, false
}

// StringToDependencyType looks ud up in dependencyTable. The boolean reports
// whether the label is known; for an unknown label the zero
// lingo.DependencyType is returned.
func StringToDependencyType(ud string) (lingo.DependencyType, bool) {
	if found, known := dependencyTable[ud]; known {
		return found, true
	}

	var zero lingo.DependencyType
	return zero, false
}

// reset returns a fresh, empty set of per-sentence accumulators: the lexeme
// sentence, its POS tags, its heads and its dependency types.
func reset() (lingo.LexemeSentence, []lingo.POSTag, []int, []lingo.DependencyType) {
	return lingo.NewLexemeSentence(),
		make([]lingo.POSTag, 0),
		make([]int, 0),
		make([]lingo.DependencyType, 0)
}

// finish bundles the per-sentence accumulators into a SentenceTag, appends it
// to sentences and returns the extended slice.
func finish(s lingo.LexemeSentence, st []lingo.POSTag, sh []int, sdt []lingo.DependencyType, sentences []SentenceTag) []SentenceTag {
	return append(sentences, SentenceTag{
		Sentence: s,
		Tags:     st,
		Heads:    sh,
		Labels:   sdt,
	})
}


================================================
FILE: utils.go
================================================
package lingo

// InStringSlice reports whether s is equal to at least one element of l.
func InStringSlice(s string, l []string) bool {
	for idx := 0; idx < len(l); idx++ {
		if l[idx] == s {
			return true
		}
	}
	return false
}

// is is a predicate over a single rune.
type is func(rune) bool

// StringIs reports whether f holds for every rune of s. It returns true for
// the empty string, since there is no rune for which f could fail.
func StringIs(s string, f is) bool {
	for _, r := range s {
		if f(r) {
			continue
		}
		return false
	}
	return true
}

// isAscii reports whether r lies in the 7-bit ASCII range, U+0000 through
// U+007F. Runes from U+0080 up (including the Latin-1 block up to U+00FF) are
// not ASCII.
func isAscii(r rune) bool {
	return 0 <= r && r <= 0x7F
}

// EqStringSlice reports whether a and b have the same length and hold equal
// strings at every index. A nil slice and an empty slice compare equal.
func EqStringSlice(a, b []string) bool {
	if len(a) != len(b) {
		return false
	}

	for idx := 0; idx < len(a); idx++ {
		if a[idx] != b[idx] {
			return false
		}
	}
	return true
}


================================================
FILE: wordFlags.go
================================================
package lingo

import (
	"fmt"
	"strings"
	"unicode"
)

// WordFlag represents the properties a word may have. A word may have
// multiple flags at once: Lexeme.Flags returns a bit set in which bit
// (1 << flag) is set for every flag constant that applies.
type WordFlag uint32

// The WordFlag constants are bit positions, not bit masks: Lexeme.Flags sets
// bit (1 << flag), so a property is tested with wf&(1<<IsLetter) != 0.
//
// Lexeme.Flags sets IsLetter, IsAscii, IsDigit, IsLower, IsPunct, IsSpace,
// IsTitle, IsUpper, LikeURL, LikeNum and IsStopWord. It does not set
// LikeEmail or IsOOV. MAXFLAG is one past the last flag and is not itself a
// flag.
const (
	NoFlag WordFlag = iota
	IsLetter
	IsAscii
	IsDigit
	IsLower
	IsPunct
	IsSpace
	IsTitle
	IsUpper
	LikeURL
	LikeNum
	LikeEmail
	IsStopWord
	IsOOV // for ner

	MAXFLAG
)

// String renders the flag set in binary, zero-padded on the left to a width
// of 14 digits.
func (f WordFlag) String() string {
	const width = 14
	return fmt.Sprintf("%0*b", width, f)
}

// Flags computes the WordFlag bit set describing the lexeme's value. For each
// property that applies, bit (1 << flag) is set:
//
//   - IsLetter, IsDigit, IsAscii, IsLower, IsPunct, IsSpace, IsUpper: every
//     rune of the value satisfies the corresponding predicate (vacuously true
//     for an empty value);
//   - LikeURL: the lexeme type is URI;
//   - LikeNum: the lower-cased value is a key of NumberWords;
//   - IsStopWord: the value, as is, is a key of stopwords;
//   - IsTitle: the first rune is upper or title case and all remaining runes
//     are lower case.
//
// LikeEmail and IsOOV are never set here.
func (l Lexeme) Flags() WordFlag {
	var wf WordFlag

	s := l.Value

	if StringIs(s, unicode.IsLetter) {
		wf |= (1 << IsLetter)
	}

	if StringIs(s, unicode.IsDigit) {
		wf |= (1 << IsDigit)
	}

	if StringIs(s, isAscii) {
		wf |= (1 << IsAscii)
	}

	if StringIs(s, unicode.IsLower) {
		wf |= (1 << IsLower)
	}

	if StringIs(s, unicode.IsPunct) {
		wf |= (1 << IsPunct)
	}

	if StringIs(s, unicode.IsSpace) {
		wf |= (1 << IsSpace)
	}

	if StringIs(s, unicode.IsUpper) {
		wf |= (1 << IsUpper)
	}

	if l.LexemeType == URI {
		wf |= (1 << LikeURL)
	}

	if _, ok := NumberWords[strings.ToLower(s)]; ok {
		wf |= (1 << LikeNum)
	}

	if _, ok := stopwords[s]; ok {
		wf |= (1 << IsStopWord)
	}

	// Title case is decided rune by rune. Looking at the first *byte* and
	// slicing at byte offset 1 would split a multi-byte first character
	// (e.g. "Élan") and misclassify the word.
	isTitle := false
	for i, r := range s {
		if i == 0 {
			isTitle = unicode.IsUpper(r) || unicode.IsTitle(r)
		} else {
			isTitle = unicode.IsLower(r)
		}
		if !isTitle {
			break
		}
	}
	if isTitle {
		wf |= (1 << IsTitle)
	}

	return wf
}